entityextractor 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/entityextractor.rb +36 -14
- data/lib/handleinput.rb +138 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d73c2f909fbe862625da5f0952032715d696de2
|
4
|
+
data.tar.gz: c33ee22a6a5cbb3eb37f649c6db2df26df366c75
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f98e67a81133a74c666d474b5d22bd0018e5d941ec7bc5f875a319ea1ee3677d93ff3917591f23fdbfdb0c0e2cc14c4f866c4b1289aa8a908acf83f6f8c07791
|
7
|
+
data.tar.gz: a5a40b53bcb12550bf096cea1b4f72f63be2b254d079df89c322619041f67dfb51f3d9da9d479f422c058acbe4257e6b17dd1e8eb0834d08ba2874d4cf31c2bd
|
data/lib/entityextractor.rb
CHANGED
@@ -1,24 +1,32 @@
|
|
1
1
|
require 'json'
|
2
2
|
load 'extractdates.rb'
|
3
|
+
load 'handleinput.rb'
|
3
4
|
require 'uploadconvert'
|
4
5
|
|
5
6
|
class EntityExtractor
|
6
|
-
def initialize(input, *extractfield)
|
7
|
+
def initialize(input, fieldoutname, *extractfield)
|
7
8
|
@input = JSON.parse(input)
|
9
|
+
@fieldoutname = fieldoutname
|
8
10
|
@extractfield = *extractfield
|
9
11
|
@output = Array.new
|
10
12
|
end
|
11
13
|
|
12
14
|
# Extract terms input from preset list
|
13
|
-
def extractTerms(
|
15
|
+
def extractTerms(extractlist, i, addlist, field)
|
14
16
|
count = 0
|
17
|
+
downcased = i[field].to_s.downcase
|
15
18
|
|
16
19
|
# Check the item for each term
|
17
|
-
|
20
|
+
extractlist.each do |t, c|
|
18
21
|
count+=1
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
if c == true
|
23
|
+
if i[field].to_s.include? t
|
24
|
+
addlist.push(t)
|
25
|
+
end
|
26
|
+
else
|
27
|
+
if downcased.include? t.downcase
|
28
|
+
addlist.push(t)
|
29
|
+
end
|
22
30
|
end
|
23
31
|
end
|
24
32
|
end
|
@@ -70,7 +78,7 @@ class EntityExtractor
|
|
70
78
|
|
71
79
|
# Generate hash of all extracted terms
|
72
80
|
@output.each do |i|
|
73
|
-
i["extract"].each do |e|
|
81
|
+
i[@fieldoutname].each do |e|
|
74
82
|
if extracthash.has_key? e
|
75
83
|
extracthash[e] += 1
|
76
84
|
else
|
@@ -88,8 +96,12 @@ class EntityExtractor
|
|
88
96
|
JSON.pretty_generate(@output)
|
89
97
|
end
|
90
98
|
|
91
|
-
def extract(type, minchar, ignoreterms,
|
99
|
+
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
|
92
100
|
flag = 0
|
101
|
+
|
102
|
+
h = HandleInput.new(terms, ignorefields, caseinfo)
|
103
|
+
extractlist = h.detecttype
|
104
|
+
|
93
105
|
@input.each do |i|
|
94
106
|
if i.length == 2
|
95
107
|
i = @input
|
@@ -101,10 +113,14 @@ class EntityExtractor
|
|
101
113
|
# Generate set terms list
|
102
114
|
if type == "set"
|
103
115
|
@extractfield.each do |f|
|
104
|
-
extractTerms(
|
116
|
+
extractTerms(extractlist, i, addlist, f)
|
117
|
+
end
|
118
|
+
|
119
|
+
if mapto
|
120
|
+
i[@fieldoutname] = h.mapout(addlist, mapto)
|
121
|
+
else
|
122
|
+
i[@fieldoutname] = addlist
|
105
123
|
end
|
106
|
-
|
107
|
-
i["extract"] = addlist
|
108
124
|
@output.push(i)
|
109
125
|
|
110
126
|
# Generate ALLCAPS terms list
|
@@ -114,7 +130,7 @@ class EntityExtractor
|
|
114
130
|
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
115
131
|
end
|
116
132
|
|
117
|
-
i["extract"] = addlist
|
133
|
+
i[@fieldoutname] = addlist
|
118
134
|
@output.push(i)
|
119
135
|
|
120
136
|
# Extract dates
|
@@ -130,10 +146,15 @@ class EntityExtractor
|
|
130
146
|
@extractfield.each do |f|
|
131
147
|
savefield = i[f].to_s + " "
|
132
148
|
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
133
|
-
extractTerms(
|
149
|
+
extractTerms(extractlist, i, addlist, f)
|
150
|
+
end
|
151
|
+
|
152
|
+
if mapto
|
153
|
+
i[@fieldoutname] = h.mapout(addlist, mapto)
|
154
|
+
else
|
155
|
+
i[@fieldoutname] = addlist
|
134
156
|
end
|
135
157
|
|
136
|
-
i["extract"] = addlist
|
137
158
|
@output.push(i)
|
138
159
|
end
|
139
160
|
|
@@ -143,3 +164,4 @@ class EntityExtractor
|
|
143
164
|
end
|
144
165
|
end
|
145
166
|
end
|
167
|
+
|
data/lib/handleinput.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
# Normalises a user-supplied term list into a lookup of
# term => case-sensitive flag, and maps matched terms back to the
# keys/fields of that term list.
#
# Three input shapes are recognised:
#   * Array                 - a flat list of terms
#   * Hash of key => value  - a "dictionary"; keys and values are both terms
#   * Hash of key => Hash   - each key owns a nested hash of field => term
class HandleInput
  # json         - the term source (Array or Hash, see the class comment).
  # ignorefields - collection of field names to leave out of the term list:
  #                "hashkey" for the outer keys, "hashval" for dictionary
  #                values, or a nested field name. nil means ignore nothing.
  # caseinfo     - for Array input: the string "casesensitive" or
  #                "noncasesensitive". For Hash input: a collection of field
  #                names ("hashkey", "hashval", nested field names) whose
  #                terms are matched case-insensitively. nil means every
  #                term of a Hash input is case-sensitive.
  def initialize(json, ignorefields, caseinfo)
    @json = json
    # Test the argument, not the (still unassigned) instance variable;
    # otherwise the caller's ignore list is always thrown away.
    @ignorefields = ignorefields.nil? ? [] : ignorefields
    @caseinfo = caseinfo
    @output = []
    @outhash = {}
  end

  # Map output to value.
  #
  # addlist - Array of terms that were matched.
  # mapto   - "key" to map each term to the outer hash key that owns it,
  #           otherwise the name of a nested field whose value is reported.
  #
  # Returns an Array of mapped values without duplicates, in first-match
  # order.
  def mapout(addlist, mapto)
    outarr = []

    addlist.each do |term|
      @json.each do |key, val|
        matched =
          if mapto == "key" && !val.is_a?(Hash)
            # Map for dictionaries: the term may be the key or the value
            val == term || key == term
          else
            # Nested hash: the term must be one of the nested values
            val.any? { |_field, nested| nested == term }
          end
        next unless matched

        target = mapto == "key" ? key : val[mapto]
        # Only map if not already matched
        outarr.push(target) unless outarr.include?(target)
      end
    end

    outarr
  end

  # Figure out which type of input it is: array, hash, hash with hash values.
  # Only the first value of a Hash is inspected to tell the two hash shapes
  # apart.
  #
  # Returns the Hash of term => true (case-sensitive) / false.
  def detecttype
    if @json.is_a?(Array)
      @output = @json
      checkCase
    elsif @json.is_a?(Hash) && !@json.empty?
      if @json.values.first.is_a?(Hash)
        parseValHash
      else
        parseDictionary
      end
    end

    @outhash
  end

  # Adds case sensitive preferences for Array input.
  # Any caseinfo other than "casesensitive"/"noncasesensitive" leaves the
  # term list empty.
  def checkCase
    if @caseinfo == "casesensitive"
      @output.each { |term| @outhash[term] = true }
    elsif @caseinfo == "noncasesensitive"
      @output.each { |term| @outhash[term] = false }
    end
  end

  # Handle hashes where the values are a hash: registers the outer key
  # (field name "hashkey") and every nested value under its field name.
  def parseValHash
    @json.each do |k, v|
      @outhash[k] = !case_insensitive?("hashkey") unless @ignorefields.include?("hashkey")

      v.each do |field, term|
        next if @ignorefields.include?(field)

        @outhash[term] = !case_insensitive?(field)
      end
    end
  end

  # Handle hashes: registers each key (field name "hashkey") and each value
  # (field name "hashval") as a term.
  def parseDictionary
    @json.each do |k, v|
      @outhash[k] = !case_insensitive?("hashkey") unless @ignorefields.include?("hashkey")
      @outhash[v] = !case_insensitive?("hashval") unless @ignorefields.include?("hashval")
    end
  end

  private

  # True when the given field was listed in caseinfo. A nil caseinfo lists
  # nothing, so it no longer raises NoMethodError.
  def case_insensitive?(field)
    !@caseinfo.nil? && @caseinfo.include?(field)
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.10
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Extracts entities and terms from any JSON.
|
14
14
|
email: shidash@shidash.com
|
@@ -17,6 +17,7 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- lib/extractdates.rb
|
20
|
+
- lib/handleinput.rb
|
20
21
|
- lib/entityextractor.rb
|
21
22
|
homepage: https://github.com/Shidash/EntityExtractor
|
22
23
|
licenses:
|
@@ -43,3 +44,4 @@ signing_key:
|
|
43
44
|
specification_version: 4
|
44
45
|
summary: Extracts entities and terms
|
45
46
|
test_files: []
|
47
|
+
has_rdoc:
|