termextractor 0.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/extract_set_terms.rb +92 -0
- data/lib/term_extractor.rb +46 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b96a8a2568c9b0f2bb0b17b07a6d0131b39ed0e5
|
4
|
+
data.tar.gz: 2f1612dfd0564eda138957fd50d0a6b058483b15
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2b70ffec27aa30dc168f674175693bd1dd31f0ed044af7c10ea2a39591285a373768e8b8e85828e8e07f5ec38a2316763885178368f86b1acf1f3fa134610f9e
|
7
|
+
data.tar.gz: 0c0bb009fa834204d765965aca672b2f07cbef92f8821d3edae4aa1b58df73c7dd0f2bffb0fabe058794b58cf0afbb25175df46494a5880e88150298a5f9daf0
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# Finds which of a fixed set of terms occur in selected fields of one item and
# stores the matched values on that item.
#
# The JSON library must already be loaded by the caller; this class calls
# JSON.parse but does not require 'json' itself.
class ExtractSetTerms
  # @param item [Hash] record to search. The same object is returned by
  #   #extractTerms, so the caller's record is modified in place.
  # @param extract_field [Array<String>, String] name(s) of the item fields to search
  # @param to_extract [String] JSON text. Either an array of terms, or an object
  #   mapping each output value to a hash of fields that hold its term(s).
  # @param extract_term_fields [String, Array<String>] which fields of each
  #   to_extract entry hold the terms (only used for object input)
  # @param case_sensitive [String] "case-sensitive", "case-insensitive", or the
  #   name of a per-entry field whose value "Yes" marks that entry case sensitive
  # @param save_field [String] item key that receives the array of matched values
  def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
    @item = item
    @extract_field = extract_field
    @to_extract = JSON.parse(to_extract)
    @extract_term_fields = extract_term_fields
    @case_sensitive = case_sensitive
    @save_field = save_field

    # term => value pushed to the save field when the term matches
    @extract_dict = {}
    # Deliberately the same object as item: results are written onto the input record
    @item_out = item
  end

  # Builds the term dictionary from object-style input
  # ({ output_value => { field => term or [terms] } }).
  def processHashInput
    @to_extract.each do |ex_key, ex_value|
      ex_value.each do |ex_field, ex_term|
        # NOTE(review): when extract_term_fields is a String, include? is a
        # substring test, so "full_name" also selects a field called "name".
        # Kept as-is; confirm whether exact matching was intended.
        next unless ex_field == @extract_term_fields || @extract_term_fields.include?(ex_field)

        if ex_term.is_a?(Array)
          processArrayInput(ex_term, ex_key)
        else
          # The original assigned to an undefined local here and raised NameError
          @extract_dict[ex_term] = ex_key
        end
      end
    end
  end

  # Adds every term in extract_arr to the term dictionary.
  #
  # @param extract_arr [Array] terms to register
  # @param map_val [Object, nil] value recorded for a match; when nil each term
  #   maps to itself
  def processArrayInput(extract_arr, map_val)
    extract_arr.each do |term|
      # Decide per term: reusing one variable made every later term map to the first
      @extract_dict[term] = map_val.nil? ? term : map_val
    end
  end

  # Checks whether term occurs in text as a whole word.
  #
  # @param term [String] literal term to look for (regex metacharacters are escaped)
  # @param text [String, nil] text to search; nil is treated as empty
  # @param case_sensitive [Boolean] pass false to ignore case
  # @return [Boolean] true if the term matches, false otherwise
  def matchTerm?(term, text, case_sensitive)
    term = term.to_s
    text = text.to_s # a missing item field arrives as nil
    return false if term.empty?

    # Only an explicit false requests case-insensitive matching
    if case_sensitive == false
      term = term.downcase
      text = text.downcase
    end

    # Escape so terms such as "a.b" or "a(b" are matched literally
    !(text =~ /\b(#{Regexp.escape(term)})\b/).nil?
  end

  # Decides whether a dictionary entry should be matched case sensitively.
  #
  # @param term [Array] a [term, mapped_value] pair from the term dictionary
  # @return [Boolean]
  def isCaseSensitive?(term)
    case @case_sensitive
    when "case-sensitive" then true
    when "case-insensitive" then false
    else
      # Per-entry setting: only object input has entries to look it up in
      return false unless @to_extract.is_a?(Hash)

      entry = @to_extract[term[1]]
      entry.is_a?(Hash) && entry[@case_sensitive] == "Yes"
    end
  end

  # Builds the term dictionary, checks every term against every configured
  # field and stores the de-duplicated matches under the save field.
  #
  # @return [Hash] the item passed to the constructor, with the save field set
  def extractTerms
    # Process input list
    @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
    matches = []

    # Go through each term and field to check for matches
    @extract_dict.each do |pair|
      term, mapped = pair
      sensitive = isCaseSensitive?(pair)

      # Array() lets a single field name be given as a plain String
      Array(@extract_field).each do |field|
        matches.push(mapped) if matchTerm?(term, @item[field], sensitive)
      end
    end

    # Deduplicate and return
    @item_out[@save_field] = matches.uniq
    @item_out
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'json'
|
2
|
+
load 'extract_set_terms.rb'
|
3
|
+
|
4
|
+
# Runs term extraction over every item of a JSON input and reports the results.
class TermExtractor
  # @param input [String] JSON text that is parsed and iterated item by item
  # @param extract_from [Array<String>] item fields to search; passed unchanged
  #   to ExtractSetTerms
  # @param save_field [String] item key under which matched terms are stored
  def initialize(input, extract_from, save_field)
    @input = JSON.parse(input)
    @extract_from = extract_from
    @save_field = save_field

    @output = []
  end

  # Extracts set terms from every input item.
  #
  # The output is rebuilt on each call. The original appended to it, so a second
  # call listed every item twice and doubled the counts from #getTermList.
  #
  # @param to_extract [String] JSON text describing the terms to extract
  # @param extract_term_fields [String, Array<String>] fields of each
  #   to_extract entry that hold the terms
  # @param case_sensitive [String] case-sensitivity setting passed to ExtractSetTerms
  # @return [Object] the parsed input, as before
  def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
    @output = @input.map do |item|
      ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields,
                          case_sensitive, @save_field).extractTerms
    end
    @input
  end

  # Gets all results in output.
  #
  # @return [String] pretty-printed JSON of every processed item
  def getAllOutput
    JSON.pretty_generate(@output)
  end

  # Gets only the results for which terms were found/extracted.
  #
  # @return [String] pretty-printed JSON of the items whose save field is non-empty
  def getOnlyMatching
    # Array() treats an absent save field as "nothing matched" instead of raising
    matches = @output.reject { |item| Array(item[@save_field]).empty? }
    JSON.pretty_generate(matches)
  end

  # Gets a list of the extracted terms by how often they occur.
  #
  # @return [Hash] term => number of items it was extracted from, ordered from
  #   least to most frequent
  def getTermList
    counts = Hash.new(0)

    # Increments for each occurrence of term
    @output.each do |item|
      Array(item[@save_field]).each { |term| counts[term] += 1 }
    end

    # Return hash sorted by value
    Hash[counts.sort_by { |_term, count| count }]
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: termextractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.16
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-20 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Extracts entities and terms from any JSON.
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/extract_set_terms.rb
|
20
|
+
- lib/term_extractor.rb
|
21
|
+
homepage: https://github.com/Shidash/EntityExtractor
|
22
|
+
licenses:
|
23
|
+
- GPL
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.2.2
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: Extracts entities and terms
|
45
|
+
test_files: []
|