entityextractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/entityextractor.rb +109 -0
  2. metadata +47 -0
@@ -0,0 +1,109 @@
1
+ require 'json'
2
+
3
# Extracts terms from one field of every item in a JSON document.
#
# Each extract* call stores the terms it found under the "extract" key of the
# item and appends the item to an internal output list, which can then be
# summarised with #getExtract or serialised with #genJSON.
class EntityExtractor
  # @param input [String] JSON text; it is parsed and then iterated item by item
  # @param extractfield [String] key of the field whose text is searched
  # @raise [JSON::ParserError] if input is not valid JSON
  def initialize(input, extractfield)
    @input = JSON.parse(input)
    @extractfield = extractfield
    @output = []
  end

  # Extract terms from a preset list.
  #
  # Every item is appended to the output exactly once, with "extract" set to
  # the given terms that occur in its field (in the order they were given).
  # Items are recorded even when no terms are passed or none match.
  #
  # @param terms [Array<String>] terms to look for (substring match)
  def extractTerms(*terms)
    @input.each do |item|
      text = item[@extractfield].to_s
      item["extract"] = terms.select { |term| text.include?(term) }
      @output.push(item)
    end
  end

  # Extract all terms in ALLCAPS (specify min num CAPS chars in a row).
  #
  # @param minchar [Integer] minimum number of consecutive A-Z characters
  #   that starts a term; must be at least 1
  # @param ignoreterms [Array<String>] terms that must not be recorded
  # @raise [ArgumentError] if minchar is less than 1
  def extractALLCAPS(minchar, ignoreterms)
    @input.each do |item|
      parseALLCAPS(item[@extractfield].to_s, item, minchar, [], ignoreterms)
    end
  end

  # Parses the ALLCAPS terms out of a string, stores them on the item and
  # appends the item to the output.
  #
  # A term starts at a run of minchar A-Z characters and continues through
  # every following character that is not lowercase, punctuation or a digit
  # (so spaces join consecutive ALLCAPS words). A trailing space, or a trailing
  # space plus the single character after it (typically the first letter of a
  # following capitalised word), is then trimmed off.
  #
  # The string passed in is only read, never modified.
  #
  # @param toParse [String] text to scan
  # @param i [Hash] item that receives the "extract" list
  # @param minchar [Integer] minimum number of consecutive A-Z characters
  # @param addlist [Array<String>] list the found terms are appended to
  # @param ignoreterms [Array<String>] terms that must not be recorded
  # @raise [ArgumentError] if minchar is less than 1
  def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms)
    min = Integer(minchar)
    # A zero-length run matches everywhere and the scan could never advance.
    raise ArgumentError, "minchar must be at least 1" if min < 1

    pattern = /[A-Z]{#{min}}/
    text = toParse.to_s
    pos = 0

    while (start = text.index(pattern, pos))
      # The first `min` characters are already known to be A-Z.
      stop = start + min
      stop += 1 while stop < text.length && caps_run_char?(text[stop])

      # Drop a trailing " " or " X" picked up from the text after the term.
      term = text[start...stop].sub(/ .?\z/m, "")

      # Filter out terms in ignoreterms array
      addlist.push(term) unless ignoreterms.include?(term)

      # The term keeps at least its `min` leading letters, so this advances.
      pos = start + term.length
    end

    # No (more) results: attach the list to the item and record it
    i["extract"] = addlist
    @output.push(i)
  end

  # Get list of just extracted terms by occurrence.
  #
  # @return [Hash{String => Integer}] number of recorded items each term was
  #   extracted from, ordered from least to most frequent
  def getExtract
    counts = Hash.new(0)

    @output.each do |item|
      item["extract"].each { |term| counts[term] += 1 }
    end

    Hash[counts.sort_by { |_term, count| count }]
  end

  # Generates JSON output
  #
  # @return [String] pretty-printed JSON of every recorded item
  def genJSON
    JSON.pretty_generate(@output)
  end

  private

  # True when the character may continue an ALLCAPS term: it is unchanged by
  # upcasing and is neither punctuation nor a digit.
  def caps_run_char?(char)
    char == char.upcase && char !~ /[[:punct:]]/ && char !~ /[[:digit:]]/
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: entityextractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-23 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Extracts entities and terms from any JSON.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/entityextractor.rb
21
+ homepage: https://github.com/Shidash/EntityExtractor
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Extracts entities and terms
46
+ test_files: []
47
+ has_rdoc: