entityextractor 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/entityextractor.rb +49 -36
- metadata +2 -2
data/lib/entityextractor.rb
CHANGED
@@ -1,49 +1,27 @@
|
|
1
1
|
require 'json'
|
2
2
|
|
3
3
|
class EntityExtractor
|
4
|
-
def initialize(input, extractfield)
|
4
|
+
def initialize(input, *extractfield)
|
5
5
|
@input = JSON.parse(input)
|
6
|
-
@extractfield = extractfield
|
6
|
+
@extractfield = *extractfield
|
7
7
|
@output = Array.new
|
8
8
|
end
|
9
9
|
|
10
10
|
# Extract terms input from preset list
|
11
|
-
def extractTerms(*terms)
|
12
|
-
|
13
|
-
addlist = Array.new
|
14
|
-
count = 0
|
11
|
+
def extractTerms(*terms, i, addlist, field)
|
12
|
+
count = 0
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Add found terms to output on last term
|
23
|
-
if count == terms.length
|
24
|
-
i["extract"] = addlist
|
25
|
-
@output.push(i)
|
26
|
-
end
|
27
|
-
|
28
|
-
elsif count == terms.length
|
29
|
-
i["extract"] = addlist
|
30
|
-
@output.push(i)
|
31
|
-
end
|
14
|
+
# Check the item for each term
|
15
|
+
terms.each do |t|
|
16
|
+
count+=1
|
17
|
+
if i[field].to_s.include? t
|
18
|
+
addlist.push(t)
|
32
19
|
end
|
33
20
|
end
|
34
21
|
end
|
35
22
|
|
36
|
-
# Extract all terms in ALLCAPS (specify min num CAPS chars in row)
|
37
|
-
def extractALLCAPS(minchar, ignoreterms)
|
38
|
-
@input.each do |i|
|
39
|
-
addlist = Array.new
|
40
|
-
savefield = i[@extractfield].to_s + " "
|
41
|
-
parseALLCAPS(i[@extractfield].to_s, i, minchar, addlist, ignoreterms, savefield)
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
23
|
# Parses terms in all caps
|
46
|
-
def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield)
|
24
|
+
def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
|
47
25
|
if toParse =~ (/[A-Z]{#{minchar}}/)
|
48
26
|
index = toParse =~ (/[A-Z]{#{minchar}}/)
|
49
27
|
charnum = 0
|
@@ -75,13 +53,11 @@ class EntityExtractor
|
|
75
53
|
|
76
54
|
parsedstring = toParse[0..charnum]
|
77
55
|
toParse.slice! parsedstring
|
78
|
-
parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield)
|
56
|
+
parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
|
79
57
|
|
80
58
|
# If there are no (more) results, append addlist to JSON
|
81
59
|
else
|
82
|
-
i["extract"] = addlist
|
83
|
-
i[@extractfield] = savefield
|
84
|
-
@output.push(i)
|
60
|
+
i[extractfield] = savefield
|
85
61
|
end
|
86
62
|
end
|
87
63
|
|
@@ -108,4 +84,41 @@ class EntityExtractor
|
|
108
84
|
def genJSON
|
109
85
|
JSON.pretty_generate(@output)
|
110
86
|
end
|
87
|
+
|
88
|
+
def extract(type, minchar, ignoreterms, *terms)
|
89
|
+
@input.each do |i|
|
90
|
+
addlist = Array.new
|
91
|
+
|
92
|
+
# Generate set terms list
|
93
|
+
if type == "set"
|
94
|
+
@extractfield.each do |f|
|
95
|
+
extractTerms(*terms, i, addlist, f)
|
96
|
+
end
|
97
|
+
|
98
|
+
i["extract"] = addlist
|
99
|
+
@output.push(i)
|
100
|
+
|
101
|
+
# Generate ALLCAPS terms list
|
102
|
+
elsif type == "ALLCAPS"
|
103
|
+
@extractfield.each do |f|
|
104
|
+
savefield = i[f].to_s + " "
|
105
|
+
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
106
|
+
end
|
107
|
+
|
108
|
+
i["extract"] = addlist
|
109
|
+
@output.push(i)
|
110
|
+
|
111
|
+
# Extract both set terms and ALLCAPS
|
112
|
+
elsif type == "both"
|
113
|
+
@extractfield.each do |f|
|
114
|
+
savefield = i[f].to_s + " "
|
115
|
+
parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
|
116
|
+
extractTerms(*terms, i, addlist, f)
|
117
|
+
end
|
118
|
+
|
119
|
+
i["extract"] = addlist
|
120
|
+
@output.push(i)
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
111
124
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-03-01 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Extracts entities and terms from any JSON.
|
15
15
|
email: shidash@shidash.com
|