entityextractor 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/entityextractor.rb +49 -36
  2. metadata +2 -2
@@ -1,49 +1,27 @@
1
1
  require 'json'
2
2
 
3
3
  class EntityExtractor
4
- def initialize(input, extractfield)
4
+ def initialize(input, *extractfield)
5
5
  @input = JSON.parse(input)
6
- @extractfield = extractfield
6
+ @extractfield = *extractfield
7
7
  @output = Array.new
8
8
  end
9
9
 
10
10
  # Extract terms input from preset list
11
- def extractTerms(*terms)
12
- @input.each do |i|
13
- addlist = Array.new
14
- count = 0
11
+ def extractTerms(*terms, i, addlist, field)
12
+ count = 0
15
13
 
16
- # Check the item for each term
17
- terms.each do |t|
18
- count+=1
19
- if i[@extractfield].to_s.include? t
20
- addlist.push(t)
21
-
22
- # Add found terms to output on last term
23
- if count == terms.length
24
- i["extract"] = addlist
25
- @output.push(i)
26
- end
27
-
28
- elsif count == terms.length
29
- i["extract"] = addlist
30
- @output.push(i)
31
- end
14
+ # Check the item for each term
15
+ terms.each do |t|
16
+ count+=1
17
+ if i[field].to_s.include? t
18
+ addlist.push(t)
32
19
  end
33
20
  end
34
21
  end
35
22
 
36
- # Extract all terms in ALLCAPS (specifiy min num CAPS chars in row)
37
- def extractALLCAPS(minchar, ignoreterms)
38
- @input.each do |i|
39
- addlist = Array.new
40
- savefield = i[@extractfield].to_s + " "
41
- parseALLCAPS(i[@extractfield].to_s, i, minchar, addlist, ignoreterms, savefield)
42
- end
43
- end
44
-
45
23
  # Parses terms in all caps
46
- def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield)
24
+ def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
47
25
  if toParse =~ (/[A-Z]{#{minchar}}/)
48
26
  index = toParse =~ (/[A-Z]{#{minchar}}/)
49
27
  charnum = 0
@@ -75,13 +53,11 @@ class EntityExtractor
75
53
 
76
54
  parsedstring = toParse[0..charnum]
77
55
  toParse.slice! parsedstring
78
- parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield)
56
+ parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
79
57
 
80
58
  # If there are no (more) results, append addlist to JSON
81
59
  else
82
- i["extract"] = addlist
83
- i[@extractfield] = savefield
84
- @output.push(i)
60
+ i[extractfield] = savefield
85
61
  end
86
62
  end
87
63
 
@@ -108,4 +84,41 @@ class EntityExtractor
108
84
  def genJSON
109
85
  JSON.pretty_generate(@output)
110
86
  end
87
+
88
+ def extract(type, minchar, ignoreterms, *terms)
89
+ @input.each do |i|
90
+ addlist = Array.new
91
+
92
+ # Generate set terms list
93
+ if type == "set"
94
+ @extractfield.each do |f|
95
+ extractTerms(*terms, i, addlist, f)
96
+ end
97
+
98
+ i["extract"] = addlist
99
+ @output.push(i)
100
+
101
+ # Generate ALLCAPS terms list
102
+ elsif type == "ALLCAPS"
103
+ @extractfield.each do |f|
104
+ savefield = i[f].to_s + " "
105
+ parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
106
+ end
107
+
108
+ i["extract"] = addlist
109
+ @output.push(i)
110
+
111
+ # Extract both set terms and ALLCAPS
112
+ elsif type == "both"
113
+ @extractfield.each do |f|
114
+ savefield = i[f].to_s + " "
115
+ parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
116
+ extractTerms(*terms, i, addlist, f)
117
+ end
118
+
119
+ i["extract"] = addlist
120
+ @output.push(i)
121
+ end
122
+ end
123
+ end
111
124
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-02-26 00:00:00.000000000 Z
12
+ date: 2014-03-01 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Extracts entities and terms from any JSON.
15
15
  email: shidash@shidash.com