extractpatterns 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/extractpatterns.rb +134 -0
- metadata +45 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 24ecd7395c9c79e1f035a2420c1e8d6053816d65
|
|
4
|
+
data.tar.gz: 9d1f1b45a380ff2de4b5cf2b5e7150f943df29a8
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 03b98db5070f0ada7452d5738d5c36eb8bdbb54a51b16e23bae28909fa017417ba62769fab87edee0be432f872b8f0fc5d106b105ced3f80bd6ffb271086f140
|
|
7
|
+
data.tar.gz: a1e84c75ba367661a8ca80625de90d4b4ee5875a9c47e75cd5072ddfd2b79e6b2302fcebca217dd4bb845e545805a179fa0cf9128c99da4d769f643a552d28f2
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
require 'pry'
|
|
3
|
+
require 'termextractor'
|
|
4
|
+
|
|
5
|
+
# Extracts candidate terms from selected fields of JSON items using three
# heuristics (short comma-list entries, ALLCAPS runs, and a known-term list
# handled by TermExtractor), then normalizes them against a synonym file.
class ExtractPatterns
  # @param input [String] JSON text. The parsed value is iterated with +each+
  #   and every element is indexed by field name, so it is expected to be an
  #   array of objects.
  # @param fields [Array<String>] names of the fields to search in each item
  # @param match_name [String] key under which the matches are stored on each item
  def initialize(input, fields, match_name)
    @input = JSON.parse(input)
    @fields = fields
    @match_name = match_name
    @output = []
  end

  # Split a field on commas and keep the short entries as matches.
  #
  # @param value [String, nil] field text; anything that is not a String
  #   (nil, numbers, arrays, ...) yields no matches
  # @return [Array<String>] entries of at most two words, with periods and a
  #   standalone "and" removed, surrounding whitespace stripped, and empty
  #   entries dropped
  def comma_list_matches(value)
    return [] unless value.is_a?(String)

    # Only keep items of at most two words
    short_items = value.split(",").reject { |item| item.split(" ").length > 2 }

    cleaned = short_items.map do |item|
      # Remove "and" only when it is a whole whitespace-delimited word, so
      # that words merely containing it ("android", "Sand") stay intact.
      item.delete(".").gsub(/(?<!\S)and(?!\S)/, "").strip
    end

    cleaned.reject(&:empty?)
  end

  # Get runs of ALLCAPS text past a certain length.
  #
  # NOTE(review): the pattern accepts whitespace as well as A-Z, so multi-word
  # uppercase phrases match, but surrounding spaces also count toward
  # +length+ (e.g. " PRISM " satisfies a length of 6). Left as-is; confirm
  # whether that is intended.
  #
  # @param value [String, nil] field text; non-String values yield no matches
  # @param length [Integer, nil] minimum run length; nil disables the search
  # @return [Array<String>] stripped, non-empty matches of at most 100 characters
  def get_allcaps(value, length)
    return [] unless length && value.is_a?(String)

    candidates = value.scan(/\b(?:[A-Z]|\s){#{length},}\b/)

    # Drop over-long runs (measured before stripping), then trim whitespace
    # and discard runs that consisted of whitespace only.
    candidates.reject { |match| match.length > 100 }
              .map(&:strip)
              .reject(&:empty?)
  end

  # Extract set terms from one field of one item via TermExtractor.
  #
  # @param item [Hash] the item to search
  # @param field [String] the field of +item+ to search
  # @param extract_list [String] path of the term list file handed to TermExtractor
  # @return [Object] whatever TermExtractor stored under "extracted_codewords"
  #   for the item (presumably an Array of terms -- verify against TermExtractor)
  def find_known_terms(item, field, extract_list)
    extractor = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
    extractor.extractSetTerms(File.read(extract_list), ["codeword"], "case_sensitive")
    JSON.parse(extractor.getAllOutput).first["extracted_codewords"]
  end

  # Normalize matches against a synonym list and deduplicate.
  #
  # The synonym file is a JSON object mapping a canonical name to an entry
  # whose "codeword" list holds its variants. A match equal to a variant
  # (ignoring case) is replaced by the canonical name.
  #
  # @param extracted_raw [Array<String>] raw matches
  # @param synonym_list [String] path of the JSON synonym file
  # @return [Array<String>] unmatched terms in their original order followed
  #   by canonical names, without duplicates
  def normalize_results(extracted_raw, synonym_list)
    synonyms = load_synonyms(synonym_list)
    normalized = extracted_raw.dup

    extracted_raw.each do |extracted|
      synonyms.each do |canonical, entry|
        # Array() tolerates an entry without a "codeword" list
        Array(entry["codeword"]).each do |word|
          next unless word.downcase == extracted.downcase

          # Match found: swap the raw term for its canonical name
          normalized.delete(extracted)
          normalized.push(canonical)
        end
      end
    end

    normalized.uniq
  end

  # Go through all items and all configured fields, storing the merged and
  # normalized matches on each item under the match name.
  #
  # Items are modified in place and appended to an internal output list that
  # is not reset between calls, so calling this twice lists every item twice.
  #
  # @param allcaps_length [Integer, nil] minimum length for ALLCAPS matches
  # @param extract_list [String] path of the known-term / synonym JSON file
  # @param merge_field [String, nil] optional field whose existing values are
  #   merged into the matches
  # @return [Array<Hash>] the accumulated items
  def search_fields(allcaps_length, extract_list, merge_field)
    @input.each do |item|
      item[@match_name] = []

      @fields.each do |field|
        # Extract list results, allcaps, and known codewords from each field
        list_results = comma_list_matches(item[field])
        allcaps_results = get_allcaps(item[field], allcaps_length)
        merge_results = item[merge_field] || []
        # Array() guards the union below against a nil extractor result
        known_terms_results = Array(find_known_terms(item, field, extract_list))

        # Merge results and post-process
        combined = allcaps_results | list_results | merge_results | known_terms_results
        item[@match_name] = item[@match_name] | normalize_results(combined, extract_list)
      end

      # Push updated item out
      @output.push(item)
    end

    @output
  end

  # Count how often each match occurs across all results.
  #
  # @param results [Array<Hash>] items as returned by #search_fields; matches
  #   are read from the configured match name, and items without that key are
  #   skipped
  # @return [Array<Array>] [match, count] pairs sorted by ascending count
  #   (an Array of pairs, not a Hash, because that is what sort_by returns)
  def ranked_hash_output(results)
    counts = Hash.new(0)

    results.each do |item|
      Array(item[@match_name]).each { |match| counts[match] += 1 }
    end

    counts.sort_by { |_match, count| count }
  end

  private

  # Read and parse a synonym file once per path and reuse the parsed result.
  def load_synonyms(path)
    @synonym_cache ||= {}
    @synonym_cache[path] ||= JSON.parse(File.read(path))
  end
end
|
|
120
|
+
|
|
121
|
+
#dir = "/home/shidash/Data/unknown_test"
|
|
122
|
+
#overalloutput = Array.new
|
|
123
|
+
#Dir.foreach(dir) do |file|
|
|
124
|
+
# next if file == '.' or file == '..'
|
|
125
|
+
# if !File.directory?(dir+"/"+file) && file.include?(".json") && !file.include?(".json.gpg")
|
|
126
|
+
# e = ExtractPatterns.new(File.read(dir+"/"+file), ["additional_info", "job_description", "skills", "summary"], "tools_mentioned")
|
|
127
|
+
# results = e.search_fields(6, "extract_list.json", nil)
|
|
128
|
+
# File.write(file.gsub(".json", "_extracted.json"), JSON.pretty_generate(results))
|
|
129
|
+
# overalloutput.concat(results)
|
|
130
|
+
# end
|
|
131
|
+
#end
|
|
132
|
+
|
|
133
|
+
#e = ExtractPatterns.new(File.read("MECWEDB.json"), ["description", "summary"], "tools_mentioned")
|
|
134
|
+
#puts e.ranked_hash_output(overalloutput)
|
metadata
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: extractpatterns
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- M. C. McGrath
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2015-12-25 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Extracts entities and terms from any JSON.
|
|
14
|
+
email: shidash@shidash.com
|
|
15
|
+
executables: []
|
|
16
|
+
extensions: []
|
|
17
|
+
extra_rdoc_files: []
|
|
18
|
+
files:
|
|
19
|
+
- lib/extractpatterns.rb
|
|
20
|
+
homepage: https://github.com/transparencytoolkit/ExtractPatterns
|
|
21
|
+
licenses:
|
|
22
|
+
- GPL
|
|
23
|
+
metadata: {}
|
|
24
|
+
post_install_message:
|
|
25
|
+
rdoc_options: []
|
|
26
|
+
require_paths:
|
|
27
|
+
- lib
|
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '0'
|
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
34
|
+
requirements:
|
|
35
|
+
- - ">="
|
|
36
|
+
- !ruby/object:Gem::Version
|
|
37
|
+
version: '0'
|
|
38
|
+
requirements: []
|
|
39
|
+
rubyforge_project:
|
|
40
|
+
rubygems_version: 2.4.8
|
|
41
|
+
signing_key:
|
|
42
|
+
specification_version: 4
|
|
43
|
+
summary: Extracts entities and terms
|
|
44
|
+
test_files: []
|
|
45
|
+
has_rdoc:
|