entityextractor 0.0.10 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/entityextractor.rb +36 -14
- data/lib/handleinput.rb +138 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d73c2f909fbe862625da5f0952032715d696de2
|
4
|
+
data.tar.gz: c33ee22a6a5cbb3eb37f649c6db2df26df366c75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f98e67a81133a74c666d474b5d22bd0018e5d941ec7bc5f875a319ea1ee3677d93ff3917591f23fdbfdb0c0e2cc14c4f866c4b1289aa8a908acf83f6f8c07791
|
7
|
+
data.tar.gz: a5a40b53bcb12550bf096cea1b4f72f63be2b254d079df89c322619041f67dfb51f3d9da9d479f422c058acbe4257e6b17dd1e8eb0834d08ba2874d4cf31c2bd
|
data/lib/entityextractor.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require 'json'
|
2
2
|
load 'extractdates.rb'
|
3
|
+
load 'handleinput.rb'
|
3
4
|
require 'uploadconvert'
|
4
5
|
|
5
6
|
class EntityExtractor
|
6
|
-
def initialize(input, *extractfield)
|
7
|
+
def initialize(input, fieldoutname, *extractfield)
|
7
8
|
@input = JSON.parse(input)
|
9
|
+
@fieldoutname = fieldoutname
|
8
10
|
@extractfield = *extractfield
|
9
11
|
@output = Array.new
|
10
12
|
end
|
11
13
|
|
12
14
|
# Extract terms input from preset list
|
13
|
-
def extractTerms(
|
15
|
+
def extractTerms(extractlist, i, addlist, field)
|
14
16
|
count = 0
|
17
|
+
downcased = i[field].to_s.downcase
|
15
18
|
|
16
19
|
# Check the item for each term
|
17
|
-
|
20
|
+
extractlist.each do |t, c|
|
18
21
|
count+=1
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
if c == true
|
23
|
+
if i[field].to_s.include? t
|
24
|
+
addlist.push(t)
|
25
|
+
end
|
26
|
+
else
|
27
|
+
if downcased.include? t.downcase
|
28
|
+
addlist.push(t)
|
29
|
+
end
|
22
30
|
end
|
23
31
|
end
|
24
32
|
end
|
@@ -70,7 +78,7 @@ class EntityExtractor
|
|
70
78
|
|
71
79
|
# Generate hash of all extracted terms
|
72
80
|
@output.each do |i|
|
73
|
-
i[
|
81
|
+
i[@fieldoutname].each do |e|
|
74
82
|
if extracthash.has_key? e
|
75
83
|
extracthash[e] += 1
|
76
84
|
else
|
@@ -88,8 +96,12 @@ class EntityExtractor
|
|
88
96
|
JSON.pretty_generate(@output)
|
89
97
|
end
|
90
98
|
|
91
|
-
def extract(type, minchar, ignoreterms,
|
99
|
+
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
|
92
100
|
flag = 0
|
101
|
+
|
102
|
+
h = HandleInput.new(terms, ignorefields, caseinfo)
|
103
|
+
extractlist = h.detecttype
|
104
|
+
|
93
105
|
@input.each do |i|
|
94
106
|
if i.length == 2
|
95
107
|
i = @input
|
@@ -101,10 +113,14 @@ class EntityExtractor
|
|
101
113
|
# Generate set terms list
|
102
114
|
if type == "set"
|
103
115
|
@extractfield.each do |f|
|
104
|
-
extractTerms(
|
116
|
+
extractTerms(extractlist, i, addlist, f)
|
117
|
+
end
|
118
|
+
|
119
|
+
if mapto
|
120
|
+
i[@fieldoutname] = h.mapout(addlist, mapto)
|
121
|
+
else
|
122
|
+
i[@fieldoutname] = addlist
|
105
123
|
end
|
106
|
-
|
107
|
-
i["extract"] = addlist
|
108
124
|
@output.push(i)
|
109
125
|
|
110
126
|
# Generate ALLCAPS terms list
|
@@ -114,7 +130,7 @@ class EntityExtractor
|
|
114
130
|
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
115
131
|
end
|
116
132
|
|
117
|
-
i[
|
133
|
+
i[@fieldoutname] = addlist
|
118
134
|
@output.push(i)
|
119
135
|
|
120
136
|
# Extract dates
|
@@ -130,10 +146,15 @@ class EntityExtractor
|
|
130
146
|
@extractfield.each do |f|
|
131
147
|
savefield = i[f].to_s + " "
|
132
148
|
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
133
|
-
extractTerms(
|
149
|
+
extractTerms(extractlist, i, addlist, f)
|
150
|
+
end
|
151
|
+
|
152
|
+
if mapto
|
153
|
+
i[@fieldoutname] = h.mapout(addlist, mapto)
|
154
|
+
else
|
155
|
+
i[@fieldoutname] = addlist
|
134
156
|
end
|
135
157
|
|
136
|
-
i["extract"] = addlist
|
137
158
|
@output.push(i)
|
138
159
|
end
|
139
160
|
|
@@ -143,3 +164,4 @@ class EntityExtractor
|
|
143
164
|
end
|
144
165
|
end
|
145
166
|
end
|
167
|
+
|
data/lib/handleinput.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# Turns a user-supplied term list into the lookup table used for term
# extraction, and maps matched terms back onto the structure they came from.
#
# Three input shapes are recognised:
# * Array                  - a flat list of terms
# * Hash with scalar values - a dictionary; keys and values are both terms
# * Hash with Hash values   - records; the key and every field value are terms
#
# The lookup table built by detecttype maps each term to a boolean:
# true means the term must be matched case-sensitively, false means it may be
# matched case-insensitively.
class HandleInput
  # Pseudo field names that let callers refer to a hash's own keys and to the
  # scalar values of a dictionary in ignorefields / caseinfo.
  HASH_KEY_FIELD = "hashkey"
  HASH_VAL_FIELD = "hashval"

  # json         - the term list (Array or Hash).
  # ignorefields - field names whose values must not become terms; may contain
  #                the pseudo fields "hashkey" / "hashval". nil ignores nothing.
  # caseinfo     - for Array input: "casesensitive" or "noncasesensitive".
  #                For Hash input: the field names (including the pseudo
  #                fields) to match case-insensitively. nil makes every
  #                Hash-derived term case-sensitive.
  def initialize(json, ignorefields, caseinfo)
    @json = json
    # Check the argument itself. Testing the (still unset) instance variable
    # would always discard the caller's list.
    @ignorefields = ignorefields || []
    @caseinfo = caseinfo
    @output = []
    @outhash = {}
  end

  # Map output to value.
  #
  # addlist - the terms that were matched.
  # mapto   - "key" to map each term to the top-level key of the entry that
  #           contains it, or a field name to map it to that field's value in
  #           the containing record.
  #
  # Returns an Array of mapped values without duplicates, ordered by first
  # match (terms in addlist order, entries in term-list order).
  def mapout(addlist, mapto)
    outarr = []

    addlist.each do |term|
      @json.each do |key, value|
        if mapto == "key"
          target = key
          # Records match on any field value; dictionary entries match on
          # either their value or their key.
          matched = value.is_a?(Hash) ? value.value?(term) : (value == term || key == term)
        else
          # Only records carry named fields, so other entries cannot be mapped.
          next unless value.is_a?(Hash)
          target = value[mapto]
          matched = value.value?(term)
        end

        outarr.push(target) if matched && !outarr.include?(target)
      end
    end

    outarr
  end

  # Figure out which type of input it is (array, hash, hash with hash values)
  # and build the lookup table accordingly.
  #
  # Returns the Hash of term => case-sensitive flag. It stays empty when the
  # input is neither an Array nor a Hash, or is an empty Hash.
  def detecttype
    if @json.is_a? Array
      @output = @json
      checkCase
    elsif @json.is_a?(Hash) && !@json.empty?
      # The first value alone decides how the whole hash is interpreted.
      if @json.values.first.is_a? Hash
        parseValHash
      else
        parseDictionary
      end
    end

    @outhash
  end

  # Adds case sensitive preferences for flat term lists.
  # Any caseinfo other than "casesensitive" / "noncasesensitive" registers no
  # terms at all.
  def checkCase
    sensitive = case @caseinfo
                when "casesensitive" then true
                when "noncasesensitive" then false
                end
    return if sensitive.nil?

    @output.each { |term| @outhash[term] = sensitive }
  end

  # Handle hashes where the values are a hash: the top-level key and every
  # field value of each record become terms.
  def parseValHash
    @json.each do |key, record|
      register(key, HASH_KEY_FIELD)
      # Tolerate stray scalar entries in an otherwise record-shaped hash.
      next unless record.is_a?(Hash)

      record.each { |field, value| register(value, field) }
    end
  end

  # Handle hashes with scalar values: both the key and the value are terms.
  def parseDictionary
    @json.each do |key, value|
      register(key, HASH_KEY_FIELD)
      register(value, HASH_VAL_FIELD)
    end
  end

  private

  # Record term in the lookup table unless its field is ignored.
  def register(term, field)
    return if @ignorefields.include?(field)

    @outhash[term] = !case_insensitive?(field)
  end

  # True when caseinfo lists the field as case-insensitive. A nil caseinfo
  # lists nothing.
  def case_insensitive?(field)
    @caseinfo.respond_to?(:include?) && @caseinfo.include?(field)
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Extracts entities and terms from any JSON.
|
14
14
|
email: shidash@shidash.com
|
@@ -17,6 +17,7 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- lib/extractdates.rb
|
20
|
+
- lib/handleinput.rb
|
20
21
|
- lib/entityextractor.rb
|
21
22
|
homepage: https://github.com/Shidash/EntityExtractor
|
22
23
|
licenses:
|
@@ -43,3 +44,4 @@ signing_key:
|
|
43
44
|
specification_version: 4
|
44
45
|
summary: Extracts entities and terms
|
45
46
|
test_files: []
|
47
|
+
has_rdoc:
|