extractpatterns 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/extractpatterns.rb +134 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 24ecd7395c9c79e1f035a2420c1e8d6053816d65
|
4
|
+
data.tar.gz: 9d1f1b45a380ff2de4b5cf2b5e7150f943df29a8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 03b98db5070f0ada7452d5738d5c36eb8bdbb54a51b16e23bae28909fa017417ba62769fab87edee0be432f872b8f0fc5d106b105ced3f80bd6ffb271086f140
|
7
|
+
data.tar.gz: a1e84c75ba367661a8ca80625de90d4b4ee5875a9c47e75cd5072ddfd2b79e6b2302fcebca217dd4bb845e545805a179fa0cf9128c99da4d769f643a552d28f2
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'json'
|
2
|
+
require 'pry'
|
3
|
+
require 'termextractor'
|
4
|
+
|
5
|
+
# Pulls candidate terms out of selected fields of JSON items using three
# strategies (short comma-list entries, ALLCAPS runs, and a known-term list),
# then normalizes them against a synonym file and stores them on each item.
class ExtractPatterns
  # @param input [String] JSON text; the parsed value is iterated with #each
  #   by #search_fields and each element is indexed by field name
  # @param fields [Array<String>] keys of each item whose values are searched
  # @param match_name [String] key under which the matches are stored on
  #   each item (and read back by #ranked_hash_output)
  def initialize(input, fields, match_name)
    @input = JSON.parse(input)
    @fields = fields
    @match_name = match_name
    @output = []
  end

  # Split a value on commas and keep the short entries as candidate matches.
  #
  # @param value [String, nil] text to split; nil/false yields no matches
  # @return [Array<String>] cleaned entries of at most two words
  def comma_list_matches(value)
    return [] unless value

    value.split(",")
         # Only keep items of one or two words (counted before cleaning)
         .reject { |item| item.split(" ").length > 2 }
         # Remove the standalone word "and" and all periods. The word
         # boundaries keep words that merely contain "and" intact
         # ("hand tools", "android").
         .map { |item| item.gsub(/\band\b/, "").gsub(".", "").strip }
         # An entry that was empty or only "and" is not a usable match
         .reject(&:empty?)
  end

  # Get runs of ALLCAPS letters/whitespace of at least a given length.
  #
  # @param value [String, nil] text to scan
  # @param length [Integer, nil] minimum run length (whitespace included)
  # @return [Array<String>] stripped runs no longer than 100 characters;
  #   empty when either argument is nil/false
  def get_allcaps(value, length)
    return [] unless length && value

    value.scan(/\b(?:[A-Z]|\s){#{length},}\b/)
         # Remove matches that are too long
         .reject { |match| match.length > 100 }
         # Runs may begin or end with whitespace; trim it
         .map(&:strip)
  end

  # Extract set terms from one field of one item via TermExtractor.
  #
  # @param item [Hash] a single parsed input item
  # @param field [String] key of the item to search
  # @param extract_list [String] path of the term-list file, read on each call
  # @return whatever TermExtractor stored under "extracted_codewords" for the
  #   item (presumably an Array of terms - confirm against TermExtractor)
  def find_known_terms(item, field, extract_list)
    d = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
    d.extractSetTerms(File.read(extract_list), ["codeword"], "case_sensitive")
    JSON.parse(d.getAllOutput).first["extracted_codewords"]
  end

  # Normalize and match synonyms and deduplicate.
  #
  # The synonym file is parsed as JSON and iterated as key/value pairs, where
  # each value's "codeword" entry is a list of spellings. An extracted term
  # equal (case-insensitively) to any spelling is replaced by the pair's key.
  #
  # @param extracted_raw [Array<String>] terms to normalize (not mutated)
  # @param synonym_list [String] path of the synonym JSON file
  # @return [Array<String>] normalized terms without duplicates
  def normalize_results(extracted_raw, synonym_list)
    synonyms = JSON.parse(File.read(synonym_list))
    normalized = extracted_raw.dup

    # Iterate the untouched input while editing the copy
    extracted_raw.each do |extracted|
      synonyms.each do |canonical, entry|
        next unless entry["codeword"].any? { |word| word.downcase == extracted.downcase }

        normalized.delete(extracted)
        normalized.push(canonical)
      end
    end

    normalized.uniq
  end

  # Go through all items in JSON and fields to search.
  #
  # Each item gets its match_name key reset to the union of the normalized
  # results of every searched field; the item is then appended to the
  # instance's output list (so repeated calls append the items again).
  #
  # @param allcaps_length [Integer, nil] minimum ALLCAPS run length
  # @param extract_list [String] path used both as the known-term list and
  #   as the synonym file
  # @param merge_field [String, nil] key of an existing list on the item to
  #   merge into the results; ignored when the item has no such value
  # @return [Array<Hash>] the accumulated output items
  def search_fields(allcaps_length, extract_list, merge_field)
    @input.each do |item|
      item[@match_name] = []

      @fields.each do |field|
        # Extract list results, allcaps, and known codewords from each field
        list_results = comma_list_matches(item[field])
        allcaps_results = get_allcaps(item[field], allcaps_length)
        merge_results = item[merge_field] || []
        # Array() guards the union below in case no terms were stored (nil)
        known_terms_results = Array(find_known_terms(item, field, extract_list))

        # Merge results and post-process
        combined = allcaps_results | list_results | merge_results | known_terms_results
        item[@match_name] = item[@match_name] | normalize_results(combined, extract_list)
      end

      # Push updated item out
      @output.push(item)
    end

    @output
  end

  # Return a ranked list of the results.
  #
  # Counts how often each match occurs under the configured match_name key
  # across all results. Items without that key contribute nothing.
  #
  # @param results [Array<Hash>] items as returned by #search_fields
  # @return [Array<Array>] [match, count] pairs sorted by ascending count
  def ranked_hash_output(results)
    counts = Hash.new(0)
    results.each do |result|
      Array(result[@match_name]).each { |match| counts[match] += 1 }
    end

    counts.sort_by { |_match, count| count }
  end
end
|
120
|
+
|
121
|
+
#dir = "/home/shidash/Data/unknown_test"
|
122
|
+
#overalloutput = Array.new
|
123
|
+
#Dir.foreach(dir) do |file|
|
124
|
+
# next if file == '.' or file == '..'
|
125
|
+
# if !File.directory?(dir+"/"+file) && file.include?(".json") && !file.include?(".json.gpg")
|
126
|
+
# e = ExtractPatterns.new(File.read(dir+"/"+file), ["additional_info", "job_description", "skills", "summary"], "tools_mentioned")
|
127
|
+
# results = e.search_fields(6, "extract_list.json", nil)
|
128
|
+
# File.write(file.gsub(".json", "_extracted.json"), JSON.pretty_generate(results))
|
129
|
+
# overalloutput.concat(results)
|
130
|
+
# end
|
131
|
+
#end
|
132
|
+
|
133
|
+
#e = ExtractPatterns.new(File.read("MECWEDB.json"), ["description", "summary"], "tools_mentioned")
|
134
|
+
#puts e.ranked_hash_output(overalloutput)
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: extractpatterns
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- M. C. McGrath
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-25 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Extracts entities and terms from any JSON.
|
14
|
+
email: shidash@shidash.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/extractpatterns.rb
|
20
|
+
homepage: https://github.com/transparencytoolkit/ExtractPatterns
|
21
|
+
licenses:
|
22
|
+
- GPL
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.4.8
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Extracts entities and terms
|
44
|
+
test_files: []
|
45
|
+
has_rdoc:
|