termextractor 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b96a8a2568c9b0f2bb0b17b07a6d0131b39ed0e5
4
+ data.tar.gz: 2f1612dfd0564eda138957fd50d0a6b058483b15
5
+ SHA512:
6
+ metadata.gz: 2b70ffec27aa30dc168f674175693bd1dd31f0ed044af7c10ea2a39591285a373768e8b8e85828e8e07f5ec38a2316763885178368f86b1acf1f3fa134610f9e
7
+ data.tar.gz: 0c0bb009fa834204d765965aca672b2f07cbef92f8821d3edae4aa1b58df73c7dd0f2bffb0fabe058794b58cf0afbb25175df46494a5880e88150298a5f9daf0
@@ -0,0 +1,92 @@
1
# Looks for a fixed set of terms in selected fields of a single item and
# records which terms were found.
#
# The term list arrives as a JSON string and may be either:
# * an Array of terms, in which case each term is reported as itself, or
# * a Hash of `key => { field => term_or_terms, ... }`, in which case a match
#   on any term is reported as the entry's key.
#
# NOTE: this class calls JSON.parse but does not require the JSON library
# itself; the caller must have loaded it.
class ExtractSetTerms
  # @param item                 record to scan; it is indexed by field name and
  #                             is annotated in place (the returned item is the
  #                             same object)
  # @param extract_field        field name, or list of field names, of +item+
  #                             whose text is searched
  # @param to_extract           JSON string holding the term list (Array or Hash)
  # @param extract_term_fields  for Hash term lists: the field name, or list of
  #                             field names, of each entry that hold terms
  # @param case_sensitive       "case-sensitive", "case-insensitive", or the
  #                             name of a per-entry field whose value "Yes"
  #                             marks that entry as case sensitive
  # @param save_field           key of +item+ under which matches are stored
  def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
    @item = item
    @extract_field = extract_field

    @to_extract = JSON.parse(to_extract)
    @extract_term_fields = extract_term_fields
    @case_sensitive = case_sensitive

    # Maps each term to search for => the value reported when it matches
    @extract_dict = {}
    @save_field = save_field
    @item_out = item
  end

  # Builds the term dictionary from a Hash term list.
  def processHashInput
    # Array() accepts a single field name or a list, and compares whole names
    # (String#include? would also accept substrings of the configured name).
    term_fields = Array(@extract_term_fields)

    @to_extract.each do |ex_key, ex_value|
      ex_value.each do |ex_field, ex_term|
        next unless term_fields.include?(ex_field)

        if ex_term.is_a?(Array)
          processArrayInput(ex_term, ex_key)
        else
          @extract_dict[ex_term] = ex_key
        end
      end
    end
  end

  # Adds every term in +extract_arr+ to the term dictionary.
  #
  # @param extract_arr  list of terms
  # @param map_val      value reported for a match; when nil, each term is
  #                     reported as itself
  def processArrayInput(extract_arr, map_val)
    extract_arr.each do |term|
      # Decide per term: reassigning map_val here would make every later term
      # report the first term instead of itself.
      @extract_dict[term] = map_val.nil? ? term : map_val
    end
  end

  # Checks whether +term+ appears in +text+ between word boundaries.
  #
  # @param term            term to look for (treated as literal text)
  # @param text            text to search; nil is treated as empty
  # @param case_sensitive  pass false to ignore case
  # @return [Boolean]
  def matchTerm?(term, text, case_sensitive)
    term = term.to_s
    text = text.to_s
    # An empty pattern would match any text that contains a word boundary.
    return false if term.empty?

    # Downcase term and text if not case sensitive
    if case_sensitive == false
      term = term.downcase
      text = text.downcase
    end

    # Escape so characters such as "." or "(" in a term are matched literally
    # rather than interpreted (or rejected) as regular expression syntax.
    !(text =~ /\b(#{Regexp.escape(term)})\b/).nil?
  end

  # Decides whether a dictionary entry is matched case sensitively.
  #
  # @param term  a [term, reported_value] pair from the term dictionary
  # @return [Boolean]
  def isCaseSensitive?(term)
    return true if @case_sensitive == "case-sensitive"
    return false if @case_sensitive == "case-insensitive"

    # Otherwise @case_sensitive names a per-entry field, which only exists
    # when the term list is a Hash; anything else is case insensitive.
    return false unless @to_extract.is_a?(Hash)

    entry = @to_extract[term[1]]
    entry.is_a?(Hash) && entry[@case_sensitive] == "Yes"
  end

  # Builds the term dictionary, searches every configured field of the item
  # for every term, and stores the de-duplicated matches under the save field.
  #
  # @return the item, with the save field set to the list of matches
  def extractTerms
    # Process input list
    @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
    @item_out[@save_field] = []

    # Go through each term and field to check for matches
    @extract_dict.each do |entry|
      term, reported = entry
      item_case_sensitivity = isCaseSensitive?(entry)

      Array(@extract_field).each do |field|
        next unless matchTerm?(term, @item[field], item_case_sensitivity)

        @item_out[@save_field].push(reported)
      end
    end

    # Deduplicate and return
    @item_out[@save_field].uniq!
    @item_out
  end
end
@@ -0,0 +1,46 @@
1
+ require 'json'
2
+ load 'extract_set_terms.rb'
3
+
4
# Runs term extraction over a JSON list of items and reports the results.
class TermExtractor
  # @param input         JSON string; the parsed value is iterated item by item
  # @param extract_from  field name(s) of each item to search, passed through
  #                      to ExtractSetTerms
  # @param save_field    key under which each item's matches are stored
  def initialize(input, extract_from, save_field)
    @input = JSON.parse(input)
    @extract_from = extract_from
    @save_field = save_field

    @output = []
  end

  # Extracts set terms from every input item and stores the annotated items
  # as the current output.
  #
  # @param to_extract           JSON string holding the term list
  # @param extract_term_fields  fields of a Hash term list that hold terms
  # @param case_sensitive       case-sensitivity setting for ExtractSetTerms
  def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
    # Items are annotated in place, so each run replaces the previous results.
    # Appending again would list every item twice and double the term counts.
    @output = []

    @input.each do |item|
      extract = ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields, case_sensitive, @save_field)
      @output.push(extract.extractTerms)
    end
  end

  # Gets all results in output
  #
  # @return [String] pretty-printed JSON of every processed item
  def getAllOutput
    JSON.pretty_generate(@output)
  end

  # Gets only the results for which terms were found/extracted
  #
  # @return [String] pretty-printed JSON of the items with a non-empty save field
  def getOnlyMatching
    matches = @output.reject { |item| item[@save_field].empty? }
    JSON.pretty_generate(matches)
  end

  # Gets a list of the extracted terms by how often they occur
  #
  # @return [Hash] term => number of items it was found in, ordered from the
  #                least to the most frequent
  def getTermList
    counthash = Hash.new(0)

    # Increments for each occurrence of term
    @output.each do |item|
      item[@save_field].each do |term|
        counthash[term] += 1
      end
    end

    # Return hash sorted by value
    Hash[counthash.sort_by { |_term, count| count }]
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: termextractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.16
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-20 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Extracts entities and terms from any JSON.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/extract_set_terms.rb
20
+ - lib/term_extractor.rb
21
+ homepage: https://github.com/Shidash/EntityExtractor
22
+ licenses:
23
+ - GPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.2.2
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Extracts entities and terms
45
+ test_files: []