entityextractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/entityextractor.rb +109 -0
  2. metadata +47 -0
@@ -0,0 +1,109 @@
1
+ require 'json'
2
+
3
# Extracts terms from one field of every item in a JSON array and records
# them on the item under the "extract" key.
#
# Typical use:
#   e = EntityExtractor.new(json_string, "text")
#   e.extractALLCAPS(3, ["THE"])
#   e.getExtract   # => { "NSA" => 4, ... }
#   e.genJSON      # => pretty-printed JSON of the annotated items
class EntityExtractor
  # @param input [String] JSON document; the extraction methods iterate it
  #   with +each+ and index every element by +extractfield+, so it is
  #   expected to be an array of objects.
  # @param extractfield [String] key of the field whose text is searched.
  def initialize(input, extractfield)
    @input = JSON.parse(input)
    @extractfield = extractfield
    @output = []
  end

  # Extract terms from a preset list.
  #
  # Every item is appended to the output with "extract" set to the terms
  # (in the order given) that occur as substrings of the item's field. An
  # item that matches nothing, or a call with no terms at all, yields an
  # empty list for that item.
  #
  # @param terms [Array<String>] terms to look for; a single array of terms
  #   is accepted as well.
  # @return [Array] the parsed input items.
  def extractTerms(*terms)
    wanted = terms.flatten

    @input.each do |i|
      text = i[@extractfield].to_s
      i["extract"] = wanted.select { |t| text.include?(t) }
      @output.push(i)
    end
  end

  # Extract all terms in ALLCAPS (specify the minimum number of consecutive
  # capital letters that starts a term).
  #
  # @param minchar [Integer] minimum run of A-Z characters that starts a term.
  # @param ignoreterms [Array<String>] extracted terms to leave out.
  # @return [Array] the parsed input items.
  def extractALLCAPS(minchar, ignoreterms = [])
    @input.each do |i|
      parseALLCAPS(i[@extractfield].to_s, i, minchar, [], ignoreterms)
    end
  end

  # Parses terms in all caps out of +toParse+, appends them to +addlist+,
  # then stores +addlist+ on the item and records the item in the output.
  #
  # A term starts at a run of at least +minchar+ capital letters and extends
  # over following characters that are unchanged by +upcase+ and are neither
  # punctuation nor digits (so spaces join multi-word terms such as
  # "UNITED STATES"). Trailing whitespace, and a lone trailing capital that
  # follows whitespace (the first letter of a following mixed-case word),
  # are trimmed off.
  #
  # The text is scanned by position and never modified: +String#to_s+
  # returns the receiver itself, so consuming the string in place would
  # destroy the item's own field.
  #
  # @param toParse [String] text to scan.
  # @param i [Hash] item that receives the "extract" list.
  # @param minchar [Integer] minimum run of capitals; values below 1 are
  #   treated as 1 so the scan always advances.
  # @param addlist [Array<String>] accumulator for the extracted terms.
  # @param ignoreterms [Array<String>, nil] terms to leave out.
  # @return [Array] the accumulated output items.
  def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms)
    text = toParse.to_s
    min_run = [minchar.to_i, 1].max
    pattern = /[A-Z]{#{min_run}}/
    pos = 0

    while (start = text.index(pattern, pos))
      # Extend the run over every character that still counts as all caps.
      stop = start
      stop += 1 while stop < text.length && caps_run_char?(text[stop])

      term = trim_caps_run(text[start...stop])
      addlist.push(term) unless ignoreterms && ignoreterms.include?(term)

      # The term always begins with a capital letter, so it is never empty
      # and the scan position strictly increases.
      pos = start + term.length
    end

    i["extract"] = addlist
    @output.push(i)
  end

  # Get list of just extracted terms by occurrence.
  #
  # @return [Hash{String=>Integer}] term => number of occurrences across all
  #   output items, ordered by ascending count.
  def getExtract
    counts = Hash.new(0)
    @output.each do |i|
      i["extract"].each { |e| counts[e] += 1 }
    end

    Hash[counts.sort_by { |_term, count| count }]
  end

  # Generates JSON output.
  #
  # @return [String] pretty-printed JSON of every recorded item.
  def genJSON
    JSON.pretty_generate(@output)
  end

  private

  # True when +c+ may be part of an all-caps run: unchanged by upcase and
  # neither punctuation nor a digit.
  def caps_run_char?(c)
    c == c.upcase && c !~ /[[:punct:][:digit:]]/
  end

  # Drops trailing whitespace, plus a single trailing character that directly
  # follows whitespace, from a raw all-caps run.
  def trim_caps_run(run)
    run.sub(/\s+\S?\z/, "")
  end
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: entityextractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - M. C. McGrath
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-23 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Extracts entities and terms from any JSON.
15
+ email: shidash@shidash.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/entityextractor.rb
21
+ homepage: https://github.com/Shidash/EntityExtractor
22
+ licenses:
23
+ - GPL
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: Extracts entities and terms
46
+ test_files: []
47
+ has_rdoc: