entityextractor 0.0.15 → 0.0.16

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 81be4e29ad4a46a33f156efb2b9f52cdc4801701
4
- data.tar.gz: ec4a0dcf08239c7b5430d56b352aea1bc4a74f8f
3
+ metadata.gz: 258653503d54eb66621aee8e606bd808b59f8c9f
4
+ data.tar.gz: 464f2250a12d1a7f486f3201a8701c9cef40d205
5
5
  SHA512:
6
- metadata.gz: 20fec5e9765264d945f6d5c2711cb588f4067c8bed0ea62b5b8b7af6651161c9bcfeaa276966bc0869167ca0410d10b6ec3c01a74eca431daa5049da10af2b0f
7
- data.tar.gz: 929e7f0ecddb66594f7b9b80f63c424c1e7356b02d7e62b684847789a034dcf1144b08feabc5399e309b703bcc338db7fda18a61c6f1cf526d88e58c251bbf53
6
+ metadata.gz: e9be2c981f89562784c674497c3ce55d0123066a339bcce7b7be3afa9068b1f59222bb27c81d5e6b542f38ae524df8b66b38581ae7637e311731071bc1ecefa9
7
+ data.tar.gz: d31f952542ed92dfcdc6e6d02df12fd1070a2c0d6188fa7af87a32ad0e70e6b52ec37ddcdb6c01906df88d7b489c00a48ad5d0ffbc1c5e8195232c8a43b5cf5c
@@ -0,0 +1,46 @@
1
+ require 'json'
2
+ load 'extract_set_terms.rb'
3
+
4
# Runs set-term extraction over a collection of JSON items and reports the
# results as JSON or as a term frequency list.
class EntityExtractor
  # input        - JSON string holding an array of items (hashes). A single
  #                JSON object is also accepted and treated as one item.
  # extract_from - fields of each item that are searched for terms; passed
  #                through unchanged to ExtractSetTerms.
  # save_field   - key under which the extracted terms are stored on each item.
  #
  # Raises JSON::ParserError if input is not valid JSON.
  def initialize(input, extract_from, save_field)
    parsed = JSON.parse(input)
    # Iterating a Hash would yield [key, value] pairs instead of items, so a
    # lone object is wrapped into a one-element collection.
    @input = parsed.is_a?(Array) ? parsed : [parsed]
    @extract_from = extract_from
    @save_field = save_field

    @output = []
  end

  # Extracts set terms from every input item and appends each processed item
  # to the output. Results accumulate: calling this twice appends every item
  # a second time.
  #
  # The three arguments are forwarded untouched to ExtractSetTerms.new.
  def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
    @input.each do |item|
      extract = ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields, case_sensitive, @save_field)
      @output.push(extract.extractTerms)
    end
  end

  # Gets all results in output as a pretty-printed JSON string
  def getAllOutput
    JSON.pretty_generate(@output)
  end

  # Gets only the results for which terms were found/extracted, as a
  # pretty-printed JSON string. Items without the save field count as
  # having no terms.
  def getOnlyMatching
    matches = @output.reject { |item| Array(item[@save_field]).empty? }
    JSON.pretty_generate(matches)
  end

  # Gets a list of the extracted terms by how often they occur.
  #
  # Returns a Hash of term => number of output items containing it, ordered
  # from least to most frequent.
  def getTermList
    counthash = Hash.new(0)

    # Increments for each occurrence of term; Array() tolerates items that
    # never received the save field.
    @output.each do |item|
      Array(item[@save_field]).each { |term| counthash[term] += 1 }
    end

    # Return hash sorted by value
    Hash[counthash.sort_by { |_term, count| count }]
  end
end
@@ -0,0 +1,92 @@
1
# Finds which terms from a preset list occur in the text fields of one item.
#
# The term list is either an array of terms (each term maps to itself) or a
# hash of entries, where each entry is a hash of fields and the terms found in
# the selected fields map back to the entry's key.
class ExtractSetTerms
  # item                - hash to search; it is also the object returned by
  #                       extractTerms, so the caller's item is modified.
  # extract_field       - field name or list of field names of item to search.
  # to_extract          - term list as a JSON string (an already-parsed Array
  #                       or Hash is accepted too).
  # extract_term_fields - field name or list of field names of each term-list
  #                       entry that hold terms (hash term lists only).
  # case_sensitive      - "case-sensitive", "case-insensitive", or the name of
  #                       an entry field whose value "Yes" marks that entry as
  #                       case sensitive.
  # save_field          - key of item under which matched terms are stored.
  def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
    @item = item
    @extract_field = extract_field

    @to_extract = to_extract.is_a?(String) ? JSON.parse(to_extract) : to_extract
    @extract_term_fields = extract_term_fields
    @case_sensitive = case_sensitive

    # term => value reported when the term matches
    @extract_dict = Hash.new
    @save_field = save_field
    @item_out = item
  end

  # Gets a list of terms to extract from a hash term list
  def processHashInput
    # Array() gives exact field-name comparison for both a single name and a
    # list; String#include? would match on substrings.
    wanted_fields = Array(@extract_term_fields)

    # Go through each item then each field
    @to_extract.each do |ex_key, ex_value|
      ex_value.each do |ex_field, ex_term|
        # Check if it is the right field
        next unless wanted_fields.include?(ex_field)

        # Make dictionary of terms to extract and overall mapping
        if ex_term.is_a?(Array)
          processArrayInput(ex_term, ex_key)
        else
          @extract_dict[ex_term] = ex_key
        end
      end
    end
  end

  # Add all items in array to dictionary of terms to extract.
  # With a nil map_val every term maps to itself.
  def processArrayInput(extract_arr, map_val)
    extract_arr.each do |term|
      # Decide per term; reassigning map_val here would make every later
      # term map to the first one.
      @extract_dict[term] = map_val.nil? ? term : map_val
    end
  end

  # Check if the term appears in the text as a whole word.
  # Returns true or false; nil text and empty terms never match.
  def matchTerm?(term, text, case_sensitive)
    term = term.to_s
    text = text.to_s
    # An empty pattern would match at any word boundary
    return false if term.empty?

    # Downcase term and text if not case sensitive
    if case_sensitive == false
      term = term.downcase
      text = text.downcase
    end

    # Terms are literal text, so regex metacharacters in them are escaped
    !(text =~ /\b(#{Regexp.escape(term)})\b/).nil?
  end

  # Check if item is case sensitive.
  # term is a [term, mapped_value] pair from the extraction dictionary.
  def isCaseSensitive?(term)
    return true if @case_sensitive == "case-sensitive"
    return false if @case_sensitive == "case-insensitive"

    # Handle item by item variations: only hash term lists carry a per-entry
    # setting, anything else is treated as case insensitive.
    return false unless @to_extract.is_a?(Hash)

    entry = @to_extract[term[1]]
    entry.is_a?(Hash) && entry[@case_sensitive] == "Yes"
  end

  # Process input list and go through all terms and fields.
  # Returns the item with the deduplicated matches stored under save_field.
  def extractTerms
    # Process input list
    @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
    @item_out[@save_field] = Array.new

    # Go through each term and field to check for matches
    @extract_dict.each do |term|
      item_case_sensitivity = isCaseSensitive?(term)
      Array(@extract_field).each do |field|
        # Add to list of terms if it matches
        @item_out[@save_field].push(term[1]) if matchTerm?(term[0], @item[field], item_case_sensitivity)
      end
    end

    # Deduplicate and return
    @item_out[@save_field].uniq!
    @item_out
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-14 00:00:00.000000000 Z
11
+ date: 2015-04-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Extracts entities and terms from any JSON.
14
14
  email: shidash@shidash.com
@@ -16,9 +16,8 @@ executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
- - lib/entityextractor.rb
20
- - lib/extractdates.rb
21
- - lib/handleinput.rb
19
+ - lib/entity_extractor.rb
20
+ - lib/extract_set_terms.rb
22
21
  homepage: https://github.com/Shidash/EntityExtractor
23
22
  licenses:
24
23
  - GPL
@@ -1,173 +0,0 @@
1
- require 'json'
2
- load 'extractdates.rb'
3
- load 'handleinput.rb'
4
- require 'uploadconvert'
5
-
6
- class EntityExtractor
7
- def initialize(input, fieldoutname, *extractfield)
8
- @input = JSON.parse(input)
9
- @fieldoutname = fieldoutname
10
- @extractfield = *extractfield
11
- @output = Array.new
12
- end
13
-
14
- # Extract terms input from preset list
15
- def extractTerms(extractlist, i, addlist, field)
16
- count = 0
17
- downcased = i[field].to_s.downcase
18
-
19
- # Check the item for each term
20
- extractlist.each do |t, c|
21
- count+=1
22
- if c == true
23
- if i[field].to_s.match(/\b(#{t})\b/)
24
- addlist.push(t) if !addlist.include? t
25
- end
26
- else
27
- if downcased.match(/\b(#{t.downcase})\b/)
28
- addlist.push(t) if !addlist.include? t
29
- end
30
- end
31
- end
32
- end
33
-
34
- # Parses terms in all caps
35
- def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
36
- if toParse =~ (/[A-Z]{#{minchar}}/)
37
- index = toParse =~ (/[A-Z]{#{minchar}}/)
38
- charnum = 0
39
-
40
- # Find word in all caps
41
- toParse.each_char do |c|
42
- if charnum >= index
43
- if toParse[c] == toParse[c].upcase && toParse[c] !~ (/[[:punct:]]/) && toParse[c] !~ (/[[:digit:]]/)
44
- charnum += 1
45
- else break
46
- end
47
- else
48
- charnum += 1
49
- end
50
- end
51
-
52
- # Remove any extra characters
53
- if toParse[charnum-2] == " "
54
- charnum = charnum-3
55
- elsif toParse[charnum-1] == " "
56
- charnum = charnum-2
57
- else charnum = charnum-1
58
- end
59
-
60
- # Filter out terms in ignoreterms array
61
- if !(ignoreterms.include? toParse[index..charnum])
62
- addlist.push(toParse[index..charnum])
63
- end
64
-
65
- parsedstring = toParse[0..charnum]
66
- toParse.slice! parsedstring
67
- parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
68
-
69
- # If there are no (more) results, append addlist to JSON
70
- else
71
- i[extractfield] = savefield
72
- end
73
- end
74
-
75
- # Get list of just extracted terms by occurrence
76
- def getExtract
77
- extracthash = Hash.new
78
-
79
- # Generate hash of all extracted terms
80
- @output.each do |i|
81
- i[@fieldoutname].each do |e|
82
- if extracthash.has_key? e
83
- extracthash[e] += 1
84
- else
85
- extracthash[e] = 1
86
- end
87
- end
88
- end
89
-
90
- # Sort hash
91
- return Hash[extracthash.sort_by { |k, v| v}]
92
- end
93
-
94
- # Generates JSON output
95
- def genJSON
96
- JSON.pretty_generate(@output)
97
- end
98
-
99
- def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
100
- flag = 0
101
-
102
- h = HandleInput.new(terms, ignorefields, caseinfo)
103
- extractlist = h.detecttype
104
-
105
- @input.each do |i|
106
- if i.length == 2
107
- i = @input
108
- flag = 1
109
- end
110
-
111
- addlist = Array.new
112
-
113
- # Generate set terms list
114
- if type == "set"
115
- @extractfield.each do |f|
116
- extractTerms(extractlist, i, addlist, f)
117
- end
118
-
119
- if mapto
120
- i[@fieldoutname] = h.mapout(addlist, mapto)
121
- else
122
- i[@fieldoutname] = addlist
123
- end
124
- @output.push(i)
125
-
126
- # Generate ALLCAPS terms list
127
- elsif type == "ALLCAPS"
128
- @extractfield.each do |f|
129
- savefield = i[f].to_s + " "
130
- parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
131
- end
132
-
133
- i[@fieldoutname] = addlist
134
- @output.push(i)
135
-
136
- # Extract dates
137
- elsif type == "date"
138
- @extractfield.each do |f|
139
- d = ExtractDates.new(i[f])
140
-
141
- appendhash = Hash.new
142
- append.each do |a|
143
- appendhash[a] = i[a]
144
- end
145
-
146
- outhash = d.chunk(appendhash)
147
- @output.push(outhash) if !outhash.empty?
148
- end
149
-
150
- # Extract both set terms and ALLCAPS
151
- elsif type == "both"
152
- @extractfield.each do |f|
153
- savefield = i[f].to_s + " "
154
- parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
155
- extractTerms(extractlist, i, addlist, f)
156
- end
157
-
158
- if mapto
159
- i[@fieldoutname] = h.mapout(addlist, mapto)
160
- else
161
- i[@fieldoutname] = addlist
162
- end
163
-
164
- @output.push(i)
165
- end
166
-
167
- if flag == 1
168
- break
169
- end
170
- end
171
- end
172
- end
173
-
@@ -1,97 +0,0 @@
1
- require 'treat'
2
- include Treat::Core::DSL
3
- require 'date'
4
- require 'json'
5
- require 'american_date'
6
-
7
- class ExtractDates
8
- def initialize(text)
9
- @text = text
10
- @output = Array.new
11
- end
12
-
13
- def chunk(append)
14
- if !@text.empty?
15
- i = @text
16
- s = paragraph(i).segment
17
- s.each do |j|
18
- dateExtract(j, append, j, i)
19
- end
20
- end
21
-
22
- return @output
23
- end
24
-
25
- # Finds matches for date formats in the blob from chunk(append)
26
- def dateExtract(blob, append, title, description)
27
- blobstring = blob.to_s
28
-
29
- begin
30
- # See below, but with yyyy-mm-dd (and months can only start with 0-1)
31
- if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
32
- save = Regexp.last_match.to_s
33
- saveparse = save.gsub("-", "/") # Needed for american_date gem
34
- addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
35
-
36
- # mm-dd-yyyy, mm/dd/yy, and similar. m or d must start with 0-3 (optional) and end in not 0
37
- # Year can only start with 19 or 20 if it is four chars, or it could be 2 char
38
- elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
39
- save = Regexp.last_match.to_s
40
- saveparse = save.gsub("-", "/")
41
- addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
42
-
43
- # Same as below but with dd before instead of in middle and supports two digit year
44
- # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
45
- elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
46
- save = Regexp.last_match.to_s
47
- addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
48
-
49
- # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
50
- # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
51
- # Case insensitive, optional/variable spaces
52
- elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
53
- save = Regexp.last_match.to_s
54
- addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
55
-
56
- # Matches: yyyy
57
- # Must start and end with word boundary, year must start with 19 or 20 and be 4 numbers
58
- elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
59
- save = Regexp.last_match.to_s
60
- addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
61
- end
62
-
63
- rescue
64
- end
65
- end
66
-
67
- # Adds an item to the hash
68
- def addItem(date, append, title, description, blob, regex)
69
- shash = Hash.new
70
- shash[:parsed_date] = date
71
- shash[:raw_date] = regex
72
- shash[:short_chunk] = title
73
-
74
- # Append fields specified
75
- unless append == {nil=>nil}
76
- append.each do |k, v|
77
- shash[k] = v
78
- end
79
- end
80
-
81
- flag = 0
82
- @output.each do |o|
83
- if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
84
- flag = 1
85
- break
86
- end
87
- end
88
-
89
- if flag == 0
90
- @output.push(shash)
91
- end
92
-
93
- blob.slice! regex
94
- dateExtract(blob, append, title, description)
95
- end
96
- end
97
-
@@ -1,148 +0,0 @@
1
- require 'json'
2
-
3
- class HandleInput
4
- def initialize(json, ignorefields, caseinfo)
5
- @json = json
6
- if ignorefields
7
- @ignorefields = ignorefields
8
- else
9
- @ignorefields = Array.new
10
- end
11
-
12
- @caseinfo = caseinfo
13
- @output = Array.new
14
- @outhash = Hash.new
15
- end
16
-
17
- # Map output to value
18
- def mapout(addlist, mapto)
19
- outarr = Array.new
20
-
21
- addlist.each do |a|
22
- if mapto == "key"
23
- @json.each do |k, v|
24
- # If it's a nested hash
25
- if v.is_a? Hash
26
- # Go through all values
27
- v.each do |z, w|
28
- # Check if k is already included
29
- if !outarr.include? k
30
- if w == a
31
- outarr.push(k)
32
- end
33
- end
34
- end
35
- else
36
- # Map for dictionaries
37
- if !outarr.include? k
38
- if v == a || k == a
39
- outarr.push(k)
40
- end
41
- end
42
- end
43
- end
44
- elsif mapto == "value"
45
- @json.each do |k, v|
46
- if !v.is_a? Hash
47
- if !outarr.include? v
48
- if k == a || v == a
49
- outarr.push(v)
50
- end
51
- end
52
- end
53
- end
54
- else
55
- @json.each do |k, v|
56
- v.each do |z, w|
57
- # Only map if not already matched
58
- if !outarr.include? v[mapto]
59
- # Check if vals match
60
- if w == a
61
- outarr.push(v[mapto])
62
- end
63
- end
64
- end
65
- end
66
- end
67
- end
68
-
69
- return outarr
70
- end
71
-
72
- # Figure out which type of input it is: array, hash, hash with hash values
73
- def detecttype
74
- if @json.is_a? Array
75
- @output = @json
76
- checkCase
77
- elsif @json.is_a? Hash
78
- @json.each do |k, v|
79
- if v.is_a? Hash
80
- parseValHash
81
- break
82
- else
83
- parseDictionary
84
- break
85
- end
86
- end
87
- end
88
-
89
- return @outhash
90
- end
91
-
92
- # Adds case sensitive preferences
93
- def checkCase
94
- if @caseinfo == "casesensitive"
95
- @output.each do |i|
96
- @outhash[i] = true
97
- end
98
- elsif @caseinfo == "noncasesensitive"
99
- @output.each do |i|
100
- @outhash[i] = false
101
- end
102
- end
103
- end
104
-
105
- # Handle hashes where the values are a hash
106
- def parseValHash
107
- @json.each do |k, v|
108
- if !@ignorefields.include? "hashkey"
109
- if @caseinfo.include? "hashkey"
110
- @outhash[k] = false
111
- else
112
- @outhash[k] = true
113
- end
114
- end
115
-
116
- v.each do |i, j|
117
- if !@ignorefields.include? i
118
- if @caseinfo.include? i
119
- @outhash[j] = false
120
- else
121
- @outhash[j] = true
122
- end
123
- end
124
- end
125
- end
126
- end
127
-
128
- # Handle hashes
129
- def parseDictionary
130
- @json.each do |k, v|
131
- if !@ignorefields.include? "hashkey"
132
- if @caseinfo.include? "hashkey"
133
- @outhash[k] = false
134
- else
135
- @outhash[k] = true
136
- end
137
- end
138
-
139
- if !@ignorefields.include? "hashval"
140
- if @caseinfo.include? "hashval"
141
- @outhash[v] = false
142
- else
143
- @outhash[v] = true
144
- end
145
- end
146
- end
147
- end
148
- end