entityextractor 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 81be4e29ad4a46a33f156efb2b9f52cdc4801701
4
- data.tar.gz: ec4a0dcf08239c7b5430d56b352aea1bc4a74f8f
3
+ metadata.gz: 258653503d54eb66621aee8e606bd808b59f8c9f
4
+ data.tar.gz: 464f2250a12d1a7f486f3201a8701c9cef40d205
5
5
  SHA512:
6
- metadata.gz: 20fec5e9765264d945f6d5c2711cb588f4067c8bed0ea62b5b8b7af6651161c9bcfeaa276966bc0869167ca0410d10b6ec3c01a74eca431daa5049da10af2b0f
7
- data.tar.gz: 929e7f0ecddb66594f7b9b80f63c424c1e7356b02d7e62b684847789a034dcf1144b08feabc5399e309b703bcc338db7fda18a61c6f1cf526d88e58c251bbf53
6
+ metadata.gz: e9be2c981f89562784c674497c3ce55d0123066a339bcce7b7be3afa9068b1f59222bb27c81d5e6b542f38ae524df8b66b38581ae7637e311731071bc1ecefa9
7
+ data.tar.gz: d31f952542ed92dfcdc6e6d02df12fd1070a2c0d6188fa7af87a32ad0e70e6b52ec37ddcdb6c01906df88d7b489c00a48ad5d0ffbc1c5e8195232c8a43b5cf5c
@@ -0,0 +1,46 @@
1
+ require 'json'
2
+ load 'extract_set_terms.rb'
3
+
4
+ class EntityExtractor
5
+ def initialize(input, extract_from, save_field)
6
+ @input = JSON.parse(input)
7
+ @extract_from = extract_from
8
+ @save_field = save_field
9
+
10
+ @output = Array.new
11
+ end
12
+
13
+ # Extracts set terms
14
+ def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
15
+ @input.each do |item|
16
+ extract = ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields, case_sensitive, @save_field)
17
+ @output.push(extract.extractTerms)
18
+ end
19
+ end
20
+
21
+ # Gets all results in output
22
+ def getAllOutput
23
+ JSON.pretty_generate(@output)
24
+ end
25
+
26
+ # Gets only the results for which terms were found/extracted
27
+ def getOnlyMatching
28
+ matches = @output.select { |item| !item[@save_field].empty? }
29
+ JSON.pretty_generate(matches)
30
+ end
31
+
32
+ # Gets a list of the extracted terms by how often they occur
33
+ def getTermList
34
+ counthash = Hash.new{0}
35
+
36
+ # Increments for each occurrence of term
37
+ @output.each do |item|
38
+ item[@save_field].each do |term|
39
+ counthash[term] += 1
40
+ end
41
+ end
42
+
43
+ # Return hash sorted by value
44
+ return Hash[counthash.sort_by { |k, v| v}]
45
+ end
46
+ end
@@ -0,0 +1,92 @@
1
+ class ExtractSetTerms
2
+ def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
3
+ @item = item
4
+ @extract_field = extract_field
5
+
6
+ @to_extract = JSON.parse(to_extract)
7
+ @extract_term_fields = extract_term_fields
8
+ @case_sensitive = case_sensitive
9
+
10
+ @extract_dict = Hash.new
11
+ @save_field = save_field
12
+ @item_out = item
13
+ end
14
+
15
+ # Gets a list of terms to extract
16
+ def processHashInput
17
+ # Go through each item then each field
18
+ @to_extract.each do |ex_key, ex_value|
19
+ ex_value.each do |ex_field, ex_term|
20
+
21
+ # Check if it is the right field
22
+ if ex_field == @extract_term_fields || @extract_term_fields.include?(ex_field)
23
+ # Make dictionary of terms to extract and overall mapping
24
+ ex_term.is_a?(Array) ? processArrayInput(ex_term, ex_key) : @extract_dict[term] = ex_key
25
+ end
26
+
27
+ end
28
+ end
29
+ end
30
+
31
+ # Add all items in array to dictionary of terms to extract
32
+ def processArrayInput(extract_arr, map_val)
33
+ extract_arr.each do |term|
34
+ map_val = term if map_val == nil
35
+ @extract_dict[term] = map_val
36
+ end
37
+ end
38
+
39
+ # Check if the term appears in the text
40
+ def matchTerm?(term, text, case_sensitive)
41
+ # Downcase term and text if not case sensitive
42
+ if case_sensitive == false
43
+ term = term.downcase
44
+ text = text.downcase
45
+ end
46
+
47
+ # Return if it maches
48
+ if text.to_s.match(/\b(#{term})\b/)
49
+ return true
50
+ end
51
+ end
52
+
53
+ # Check if item is case sensitive
54
+ def isCaseSensitive?(term)
55
+ if @case_sensitive == "case-sensitive"
56
+ return true
57
+ elsif @case_sensitive == "case-insensitive"
58
+ return false
59
+ else
60
+ # Handle item by item variations
61
+ is_case_sensitive = @to_extract[term[1]][@case_sensitive]
62
+ if is_case_sensitive == "Yes"
63
+ return true
64
+ else return false
65
+ end
66
+ end
67
+ end
68
+
69
+ # Process input list and go through all terms and fields
70
+ def extractTerms
71
+ # Process input list
72
+ @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
73
+ @item_out[@save_field] = Array.new
74
+
75
+ # Go through each term and field to check for matches
76
+ @extract_dict.each do |term|
77
+ item_case_sensitivity = isCaseSensitive?(term)
78
+ @extract_field.each do |field|
79
+
80
+ # Add to list of terms if it matches
81
+ if matchTerm?(term[0], @item[field], item_case_sensitivity)
82
+ @item_out[@save_field].push(term[1])
83
+ end
84
+
85
+ end
86
+ end
87
+
88
+ # Deduplicate and return
89
+ @item_out[@save_field].uniq!
90
+ return @item_out
91
+ end
92
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-14 00:00:00.000000000 Z
11
+ date: 2015-04-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Extracts entities and terms from any JSON.
14
14
  email: shidash@shidash.com
@@ -16,9 +16,8 @@ executables: []
16
16
  extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
- - lib/entityextractor.rb
20
- - lib/extractdates.rb
21
- - lib/handleinput.rb
19
+ - lib/entity_extractor.rb
20
+ - lib/extract_set_terms.rb
22
21
  homepage: https://github.com/Shidash/EntityExtractor
23
22
  licenses:
24
23
  - GPL
@@ -1,173 +0,0 @@
1
- require 'json'
2
- load 'extractdates.rb'
3
- load 'handleinput.rb'
4
- require 'uploadconvert'
5
-
6
- class EntityExtractor
7
- def initialize(input, fieldoutname, *extractfield)
8
- @input = JSON.parse(input)
9
- @fieldoutname = fieldoutname
10
- @extractfield = *extractfield
11
- @output = Array.new
12
- end
13
-
14
- # Extract terms input from preset list
15
- def extractTerms(extractlist, i, addlist, field)
16
- count = 0
17
- downcased = i[field].to_s.downcase
18
-
19
- # Check the item for each term
20
- extractlist.each do |t, c|
21
- count+=1
22
- if c == true
23
- if i[field].to_s.match(/\b(#{t})\b/)
24
- addlist.push(t) if !addlist.include? t
25
- end
26
- else
27
- if downcased.match(/\b(#{t.downcase})\b/)
28
- addlist.push(t) if !addlist.include? t
29
- end
30
- end
31
- end
32
- end
33
-
34
- # Parses terms in all caps
35
- def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
36
- if toParse =~ (/[A-Z]{#{minchar}}/)
37
- index = toParse =~ (/[A-Z]{#{minchar}}/)
38
- charnum = 0
39
-
40
- # Find word in all caps
41
- toParse.each_char do |c|
42
- if charnum >= index
43
- if toParse[c] == toParse[c].upcase && toParse[c] !~ (/[[:punct:]]/) && toParse[c] !~ (/[[:digit:]]/)
44
- charnum += 1
45
- else break
46
- end
47
- else
48
- charnum += 1
49
- end
50
- end
51
-
52
- # Remove any extra characters
53
- if toParse[charnum-2] == " "
54
- charnum = charnum-3
55
- elsif toParse[charnum-1] == " "
56
- charnum = charnum-2
57
- else charnum = charnum-1
58
- end
59
-
60
- # Filter out terms in ignoreterms array
61
- if !(ignoreterms.include? toParse[index..charnum])
62
- addlist.push(toParse[index..charnum])
63
- end
64
-
65
- parsedstring = toParse[0..charnum]
66
- toParse.slice! parsedstring
67
- parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
68
-
69
- # If there are no (more) results, append addlist to JSON
70
- else
71
- i[extractfield] = savefield
72
- end
73
- end
74
-
75
- # Get list of just extracted terms by occurrence
76
- def getExtract
77
- extracthash = Hash.new
78
-
79
- # Generate hash of all extracted terms
80
- @output.each do |i|
81
- i[@fieldoutname].each do |e|
82
- if extracthash.has_key? e
83
- extracthash[e] += 1
84
- else
85
- extracthash[e] = 1
86
- end
87
- end
88
- end
89
-
90
- # Sort hash
91
- return Hash[extracthash.sort_by { |k, v| v}]
92
- end
93
-
94
- # Generates JSON output
95
- def genJSON
96
- JSON.pretty_generate(@output)
97
- end
98
-
99
- def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
100
- flag = 0
101
-
102
- h = HandleInput.new(terms, ignorefields, caseinfo)
103
- extractlist = h.detecttype
104
-
105
- @input.each do |i|
106
- if i.length == 2
107
- i = @input
108
- flag = 1
109
- end
110
-
111
- addlist = Array.new
112
-
113
- # Generate set terms list
114
- if type == "set"
115
- @extractfield.each do |f|
116
- extractTerms(extractlist, i, addlist, f)
117
- end
118
-
119
- if mapto
120
- i[@fieldoutname] = h.mapout(addlist, mapto)
121
- else
122
- i[@fieldoutname] = addlist
123
- end
124
- @output.push(i)
125
-
126
- # Generate ALLCAPS terms list
127
- elsif type == "ALLCAPS"
128
- @extractfield.each do |f|
129
- savefield = i[f].to_s + " "
130
- parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
131
- end
132
-
133
- i[@fieldoutname] = addlist
134
- @output.push(i)
135
-
136
- # Extract dates
137
- elsif type == "date"
138
- @extractfield.each do |f|
139
- d = ExtractDates.new(i[f])
140
-
141
- appendhash = Hash.new
142
- append.each do |a|
143
- appendhash[a] = i[a]
144
- end
145
-
146
- outhash = d.chunk(appendhash)
147
- @output.push(outhash) if !outhash.empty?
148
- end
149
-
150
- # Extract both set terms and ALLCAPS
151
- elsif type == "both"
152
- @extractfield.each do |f|
153
- savefield = i[f].to_s + " "
154
- parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
155
- extractTerms(extractlist, i, addlist, f)
156
- end
157
-
158
- if mapto
159
- i[@fieldoutname] = h.mapout(addlist, mapto)
160
- else
161
- i[@fieldoutname] = addlist
162
- end
163
-
164
- @output.push(i)
165
- end
166
-
167
- if flag == 1
168
- break
169
- end
170
- end
171
- end
172
- end
173
-
@@ -1,97 +0,0 @@
1
- require 'treat'
2
- include Treat::Core::DSL
3
- require 'date'
4
- require 'json'
5
- require 'american_date'
6
-
7
- class ExtractDates
8
- def initialize(text)
9
- @text = text
10
- @output = Array.new
11
- end
12
-
13
- def chunk(append)
14
- if !@text.empty?
15
- i = @text
16
- s = paragraph(i).segment
17
- s.each do |j|
18
- dateExtract(j, append, j, i)
19
- end
20
- end
21
-
22
- return @output
23
- end
24
-
25
- # Finds matches for date formats in the blob from chunk(append)
26
- def dateExtract(blob, append, title, description)
27
- blobstring = blob.to_s
28
-
29
- begin
30
- # See below, but with yyyy-mm-dd (and months can only start with 0-1
31
- if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
32
- save = Regexp.last_match.to_s
33
- saveparse = save.gsub("-", "/") # Needed for american_date gem
34
- addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
35
-
36
- # mm-dd-yyyy, mm/dd/yy, and similar. m or d must start with 0-3 (optional) and end in not 0
37
- # Year can only start with 19 or 20 if it is four chars, or it could be 2 char
38
- elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
39
- save = Regexp.last_match.to_s
40
- saveparse = save.gsub("-", "/")
41
- addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
42
-
43
- # Same as below but with dd before instead of in middle and supports two digit year
44
- # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
45
- elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
46
- save = Regexp.last_match.to_s
47
- addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
48
-
49
- # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
50
- # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
51
- # Case insensitive, optional/variable spaces
52
- elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
53
- save = Regexp.last_match.to_s
54
- addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
55
-
56
- # Matches: yyyy
57
- # Must start and end with word boundry, year must start with 19 or 20 and be 4 numbers
58
- elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
59
- save = Regexp.last_match.to_s
60
- addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
61
- end
62
-
63
- rescue
64
- end
65
- end
66
-
67
- # Adds and item to the hash
68
- def addItem(date, append, title, description, blob, regex)
69
- shash = Hash.new
70
- shash[:parsed_date] = date
71
- shash[:raw_date] = regex
72
- shash[:short_chunk] = title
73
-
74
- # Append fields specified
75
- unless append == {nil=>nil}
76
- append.each do |k, v|
77
- shash[k] = v
78
- end
79
- end
80
-
81
- flag = 0
82
- @output.each do |o|
83
- if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
84
- flag = 1
85
- break
86
- end
87
- end
88
-
89
- if flag == 0
90
- @output.push(shash)
91
- end
92
-
93
- blob.slice! regex
94
- dateExtract(blob, append, title, description)
95
- end
96
- end
97
-
@@ -1,148 +0,0 @@
1
- require 'json'
2
-
3
- class HandleInput
4
- def initialize(json, ignorefields, caseinfo)
5
- @json = json
6
- if ignorefields
7
- @ignorefields = ignorefields
8
- else
9
- @ignorefields = Array.new
10
- end
11
-
12
- @caseinfo = caseinfo
13
- @output = Array.new
14
- @outhash = Hash.new
15
- end
16
-
17
- # Map output to value
18
- def mapout(addlist, mapto)
19
- outarr = Array.new
20
-
21
- addlist.each do |a|
22
- if mapto == "key"
23
- @json.each do |k, v|
24
- # If it's a nested hash
25
- if v.is_a? Hash
26
- # Go through all values
27
- v.each do |z, w|
28
- # Check if k is already included
29
- if !outarr.include? k
30
- if w == a
31
- outarr.push(k)
32
- end
33
- end
34
- end
35
- else
36
- # Map for dictionaries
37
- if !outarr.include? k
38
- if v == a || k == a
39
- outarr.push(k)
40
- end
41
- end
42
- end
43
- end
44
- elsif mapto == "value"
45
- @json.each do |k, v|
46
- if !v.is_a? Hash
47
- if !outarr.include? v
48
- if k == a || v == a
49
- outarr.push(v)
50
- end
51
- end
52
- end
53
- end
54
- else
55
- @json.each do |k, v|
56
- v.each do |z, w|
57
- # Only map if not already matched
58
- if !outarr.include? v[mapto]
59
- # Check if vals match
60
- if w == a
61
- outarr.push(v[mapto])
62
- end
63
- end
64
- end
65
- end
66
- end
67
- end
68
-
69
- return outarr
70
- end
71
-
72
- # Figure out which type of input it is: array, hash, hash with hash values
73
- def detecttype
74
- if @json.is_a? Array
75
- @output = @json
76
- checkCase
77
- elsif @json.is_a? Hash
78
- @json.each do |k, v|
79
- if v.is_a? Hash
80
- parseValHash
81
- break
82
- else
83
- parseDictionary
84
- break
85
- end
86
- end
87
- end
88
-
89
- return @outhash
90
- end
91
-
92
- # Adds case sensitive preferences
93
- def checkCase
94
- if @caseinfo == "casesensitive"
95
- @output.each do |i|
96
- @outhash[i] = true
97
- end
98
- elsif @caseinfo == "noncasesensitive"
99
- @output.each do |i|
100
- @outhash[i] = false
101
- end
102
- end
103
- end
104
-
105
- # Handle hashes where the values are a hash
106
- def parseValHash
107
- @json.each do |k, v|
108
- if !@ignorefields.include? "hashkey"
109
- if @caseinfo.include? "hashkey"
110
- @outhash[k] = false
111
- else
112
- @outhash[k] = true
113
- end
114
- end
115
-
116
- v.each do |i, j|
117
- if !@ignorefields.include? i
118
- if @caseinfo.include? i
119
- @outhash[j] = false
120
- else
121
- @outhash[j] = true
122
- end
123
- end
124
- end
125
- end
126
- end
127
-
128
- # Handle hashes
129
- def parseDictionary
130
- @json.each do |k, v|
131
- if !@ignorefields.include? "hashkey"
132
- if @caseinfo.include? "hashkey"
133
- @outhash[k] = false
134
- else
135
- @outhash[k] = true
136
- end
137
- end
138
-
139
- if !@ignorefields.include? "hashval"
140
- if @caseinfo.include? "hashval"
141
- @outhash[v] = false
142
- else
143
- @outhash[v] = true
144
- end
145
- end
146
- end
147
- end
148
- end