entityextractor 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/entityextractor.rb +109 -0
- metadata +47 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
|
|
3
|
+
# Extracts terms from the records of a JSON document.
#
# The input is a JSON string holding an array of objects (a single top-level
# object is treated as a one-element array). Each extraction method reads the
# text stored under +extractfield+ in every record, stores the terms it found
# under the record's "extract" key, and appends the record to the output
# list that #getExtract and #genJSON report on.
class EntityExtractor
  # input        - JSON string: an array of objects, or a single object.
  # extractfield - key whose value is searched for terms in each record.
  #
  # Raises JSON::ParserError when input is not valid JSON.
  def initialize(input, extractfield)
    parsed = JSON.parse(input)
    # Iterating a Hash would yield [key, value] pairs instead of records.
    @input = parsed.is_a?(Hash) ? [parsed] : parsed
    @extractfield = extractfield
    @output = []
  end

  # Extract terms input from preset list.
  #
  # terms - the strings to look for, given either as separate arguments or
  #         as a single array.
  #
  # Every record is appended to the output with "extract" set to the terms
  # (in the order given) that occur as substrings of its extract field;
  # records without a match get an empty list.
  #
  # Returns the parsed input records.
  def extractTerms(*terms)
    wanted = terms.flatten

    @input.each do |item|
      text = item[@extractfield].to_s
      item["extract"] = wanted.select { |term| text.include?(term) }
      @output.push(item)
    end
  end

  # Extract all terms in ALLCAPS (specify min num CAPS chars in row).
  #
  # minchar     - minimum number of consecutive A-Z characters that starts
  #               a term.
  # ignoreterms - terms that must not be reported (defaults to none).
  #
  # Returns the parsed input records.
  def extractALLCAPS(minchar, ignoreterms = [])
    @input.each do |item|
      parseALLCAPS(item[@extractfield].to_s, item, minchar, [], ignoreterms)
    end
  end

  # Parses terms in all caps out of one string and records the result.
  #
  # toParse     - text to scan; it is only read, never modified.
  # i           - record that receives the found terms under "extract".
  # minchar     - minimum number of consecutive A-Z characters that starts
  #               a term.
  # addlist     - array the found terms are appended to.
  # ignoreterms - terms that are skipped instead of being appended.
  #
  # A term starts at a run of minchar A-Z characters and continues across
  # every following character that is unchanged by upcase and is neither
  # punctuation nor a digit, so spaces join several capitalised words into
  # one term.
  #
  # Returns the output list after appending the record.
  # Raises ArgumentError when minchar is smaller than 1, because an empty
  # match would never advance through the text.
  def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms)
    raise ArgumentError, "minchar must be at least 1" if minchar.to_i < 1

    text = toParse.to_s
    pattern = /[A-Z]{#{minchar}}/
    chars = text.each_char.to_a
    pos = 0

    # Walk the text with an offset instead of recursing on a shrinking copy:
    # this keeps the caller's string intact and the stack depth constant.
    while (start = text.index(pattern, pos))
      # First position after the run of characters that may belong to a term.
      stop = start
      stop += 1 while stop < chars.length && caps_run_char?(chars[stop])

      # Remove any extra characters at the end of the run.
      last = if stop - 2 > start && chars[stop - 2] == " "
               # The run ended with " X": drop the space and the single
               # capital, typically the first letter of a normal word.
               stop - 3
             elsif chars[stop - 1] == " "
               # The run ended with a space: drop it.
               stop - 2
             else
               stop - 1
             end

      # Filter out terms in ignoreterms array
      term = chars[start..last].join
      addlist.push(term) unless ignoreterms.include?(term)

      # last >= start, so every pass moves forward by at least one character.
      pos = last + 1
    end

    # No (more) results: attach the list to the record.
    i["extract"] = addlist
    @output.push(i)
  end

  # Get list of just extracted terms by occurrence.
  #
  # Returns a Hash mapping each extracted term to the number of times it
  # appears across all output records, ordered by ascending count.
  def getExtract
    counts = Hash.new(0)

    # Generate hash of all extracted terms
    @output.each do |item|
      item["extract"].each { |term| counts[term] += 1 }
    end

    # Sort hash
    Hash[counts.sort_by { |_term, count| count }]
  end

  # Generates JSON output.
  #
  # Returns the output records as a pretty-printed JSON string.
  def genJSON
    JSON.pretty_generate(@output)
  end

  private

  # True when char may be part of an all-caps term: it is unchanged by
  # upcase and is neither punctuation nor a digit.
  def caps_run_char?(char)
    char == char.upcase && char !~ /[[:punct:]]/ && char !~ /[[:digit:]]/
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: entityextractor
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
prerelease:
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- M. C. McGrath
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2014-02-23 00:00:00.000000000 Z
|
|
13
|
+
dependencies: []
|
|
14
|
+
description: Extracts entities and terms from any JSON.
|
|
15
|
+
email: shidash@shidash.com
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- lib/entityextractor.rb
|
|
21
|
+
homepage: https://github.com/Shidash/EntityExtractor
|
|
22
|
+
licenses:
|
|
23
|
+
- GPL
|
|
24
|
+
post_install_message:
|
|
25
|
+
rdoc_options: []
|
|
26
|
+
require_paths:
|
|
27
|
+
- lib
|
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
29
|
+
none: false
|
|
30
|
+
requirements:
|
|
31
|
+
- - ! '>='
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: '0'
|
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
35
|
+
none: false
|
|
36
|
+
requirements:
|
|
37
|
+
- - ! '>='
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '0'
|
|
40
|
+
requirements: []
|
|
41
|
+
rubyforge_project:
|
|
42
|
+
rubygems_version: 1.8.23
|
|
43
|
+
signing_key:
|
|
44
|
+
specification_version: 3
|
|
45
|
+
summary: Extracts entities and terms
|
|
46
|
+
test_files: []
|
|
47
|
+
has_rdoc:
|