termextractor 0.0.16

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b96a8a2568c9b0f2bb0b17b07a6d0131b39ed0e5
4
+ data.tar.gz: 2f1612dfd0564eda138957fd50d0a6b058483b15
5
+ SHA512:
6
+ metadata.gz: 2b70ffec27aa30dc168f674175693bd1dd31f0ed044af7c10ea2a39591285a373768e8b8e85828e8e07f5ec38a2316763885178368f86b1acf1f3fa134610f9e
7
+ data.tar.gz: 0c0bb009fa834204d765965aca672b2f07cbef92f8821d3edae4aa1b58df73c7dd0f2bffb0fabe058794b58cf0afbb25175df46494a5880e88150298a5f9daf0
@@ -0,0 +1,92 @@
1
# Extracts a fixed set of terms from selected fields of a single item (a Hash).
#
# The term list arrives as a JSON string that decodes to either:
# * an Array of terms, in which case each term is recorded under its own name, or
# * a Hash of { mapping_key => { field_name => term_or_array_of_terms, ... } },
#   in which case a match on any of the terms is recorded as mapping_key.
#
# Matches are written to item[save_field]; the item passed in is modified
# in place and is also the value returned by extractTerms.
class ExtractSetTerms
  # item                - Hash to search; receives the results under save_field
  # extract_field       - list of item keys whose values are searched
  # to_extract          - JSON string holding the term list (Array or Hash, see above)
  # extract_term_fields - for Hash term lists: the field name(s) that hold terms
  # case_sensitive      - "case-sensitive", "case-insensitive", or the name of a
  #                       per-entry field in the Hash term list whose value "Yes"
  #                       marks that entry as case sensitive
  # save_field          - item key under which the matched terms are stored
  def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
    @item = item
    @extract_field = extract_field

    @to_extract = JSON.parse(to_extract)
    @extract_term_fields = extract_term_fields
    @case_sensitive = case_sensitive

    @extract_dict = {}
    @save_field = save_field
    # Deliberately the same object as item: results are stored on the input.
    @item_out = item
  end

  # Builds the term => mapping dictionary from a Hash term list
  def processHashInput
    # Go through each entry, then each field of that entry
    @to_extract.each do |ex_key, ex_value|
      ex_value.each do |ex_field, ex_term|
        # Only fields designated as term fields contribute terms
        next unless ex_field == @extract_term_fields || @extract_term_fields.include?(ex_field)

        if ex_term.is_a?(Array)
          processArrayInput(ex_term, ex_key)
        else
          # A single scalar term maps straight to the entry key
          @extract_dict[ex_term] = ex_key
        end
      end
    end
  end

  # Adds every term in extract_arr to the dictionary. Terms map to map_val,
  # or to themselves when map_val is nil (plain Array term list).
  def processArrayInput(extract_arr, map_val)
    extract_arr.each do |term|
      # Decide per term; reusing one variable here would make every term
      # after the first map to the first term.
      @extract_dict[term] = map_val.nil? ? term : map_val
    end
  end

  # Returns true if term appears in text as a whole word, false otherwise.
  # A nil text (e.g. a field missing from the item) never matches.
  def matchTerm?(term, text, case_sensitive)
    term = term.to_s
    text = text.to_s
    # An empty pattern would match at any word boundary
    return false if term.empty?

    # Downcase term and text if not case sensitive
    if case_sensitive == false
      term = term.downcase
      text = text.downcase
    end

    # The term is literal text, so escape regex metacharacters before interpolating
    (text =~ /\b(#{Regexp.escape(term)})\b/) ? true : false
  end

  # Returns whether the given [term, mapping] pair should be matched case sensitively
  def isCaseSensitive?(term)
    case @case_sensitive
    when "case-sensitive" then true
    when "case-insensitive" then false
    else
      # Per-entry setting: @case_sensitive names a field inside each entry of
      # the Hash term list. A plain Array term list has no such field.
      return false unless @to_extract.is_a?(Hash)

      entry = @to_extract[term[1]]
      entry.is_a?(Hash) && entry[@case_sensitive] == "Yes"
    end
  end

  # Builds the term dictionary, checks every term against every extract field,
  # stores the deduplicated mappings in item[save_field] and returns the item.
  def extractTerms
    # Process input list
    @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
    @item_out[@save_field] = []

    # Go through each term and field to check for matches
    @extract_dict.each do |entry|
      term, mapping = entry
      item_case_sensitivity = isCaseSensitive?(entry)

      # Array() also accepts a single field name given as a String
      Array(@extract_field).each do |field|
        @item_out[@save_field].push(mapping) if matchTerm?(term, @item[field], item_case_sensitivity)
      end
    end

    # Deduplicate and return
    @item_out[@save_field].uniq!
    @item_out
  end
end
@@ -0,0 +1,46 @@
1
+ require 'json'
2
+ load 'extract_set_terms.rb'
3
+
4
# Runs term extraction over a JSON array of items and reports the results.
class TermExtractor
  # input        - JSON string that decodes to an Array of items (Hashes)
  # extract_from - list of item fields to search, passed to ExtractSetTerms
  # save_field   - item key under which extracted terms are stored
  def initialize(input, extract_from, save_field)
    @input = JSON.parse(input)
    @extract_from = extract_from
    @save_field = save_field

    @output = []
  end

  # Extracts set terms from every input item.
  #
  # The output list is rebuilt on every call. Appending instead would list
  # each item once per call and inflate the counts from getTermList.
  def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
    @output = @input.map do |item|
      extract = ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields, case_sensitive, @save_field)
      extract.extractTerms
    end
  end

  # Gets all results in output as pretty-printed JSON
  def getAllOutput
    JSON.pretty_generate(@output)
  end

  # Gets only the results for which terms were found/extracted, as pretty-printed JSON
  def getOnlyMatching
    matches = @output.reject { |item| item[@save_field].empty? }
    JSON.pretty_generate(matches)
  end

  # Gets a Hash of extracted term => number of items it was extracted from,
  # ordered from least to most frequent
  def getTermList
    counthash = Hash.new(0)

    # Increments for each occurrence of term
    @output.each do |item|
      item[@save_field].each { |term| counthash[term] += 1 }
    end

    # Return hash sorted by value
    Hash[counthash.sort_by { |_term, count| count }]
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: termextractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.16
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-20 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Extracts entities and terms from any JSON.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/extract_set_terms.rb
20
+ - lib/term_extractor.rb
21
+ homepage: https://github.com/Shidash/EntityExtractor
22
+ licenses:
23
+ - GPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.2.2
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Extracts entities and terms
45
+ test_files: []