extractpatterns 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/extractpatterns.rb +134 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 24ecd7395c9c79e1f035a2420c1e8d6053816d65
4
+ data.tar.gz: 9d1f1b45a380ff2de4b5cf2b5e7150f943df29a8
5
+ SHA512:
6
+ metadata.gz: 03b98db5070f0ada7452d5738d5c36eb8bdbb54a51b16e23bae28909fa017417ba62769fab87edee0be432f872b8f0fc5d106b105ced3f80bd6ffb271086f140
7
+ data.tar.gz: a1e84c75ba367661a8ca80625de90d4b4ee5875a9c47e75cd5072ddfd2b79e6b2302fcebca217dd4bb845e545805a179fa0cf9128c99da4d769f643a552d28f2
@@ -0,0 +1,134 @@
1
+ require 'json'
2
+ require 'pry'
3
+ require 'termextractor'
4
+
5
# Extracts candidate terms from selected text fields of JSON records.
#
# For each record, every configured field is scanned for short comma-separated
# list items and for ALLCAPS runs. Those candidates are unioned with any values
# already stored under a merge field and with the terms reported by
# TermExtractor, then normalized against a synonym list. The combined result
# is stored on the record under the configured match name.
class ExtractPatterns
  # Key that ranked_hash_output originally read unconditionally. It is kept as
  # a fallback so result sets produced with that key are still counted.
  LEGACY_MATCH_NAME = "tools_mentioned".freeze

  # input      - JSON text; search_fields iterates the parsed value and treats
  #              each element as a hash-like record.
  # fields     - keys of each record whose values are searched.
  # match_name - key under which the extracted terms are stored on each record.
  def initialize(input, fields, match_name)
    @input = JSON.parse(input)
    @fields = fields
    @match_name = match_name
    @output = []
  end

  # Splits +value+ on commas and returns the items that are at most two words
  # long, with periods and a standalone "and" removed.
  #
  # Returns [] when +value+ is not a String. Items that end up empty after
  # cleaning are dropped.
  def comma_list_matches(value)
    return [] unless value.is_a?(String)

    short_items = value.split(",").reject { |item| item.split(" ").length > 2 }

    cleaned = short_items.map do |item|
      # Only a whitespace-delimited "and" is a conjunction; removing the bare
      # substring would corrupt words such as "android" or "brand".
      item.gsub(".", "").gsub(/(?<!\S)and(?!\S)/, "").strip
    end

    cleaned.reject(&:empty?)
  end

  # Returns the runs in +value+ made up only of capital letters and whitespace
  # that are at least +length+ characters long (whitespace counts toward the
  # length) and at most 100 characters long, with surrounding whitespace
  # removed.
  #
  # Returns [] when +length+ is nil/false or +value+ is not a String.
  def get_allcaps(value, length)
    return [] unless length && value.is_a?(String)

    runs = value.scan(/\b(?:[A-Z]|\s){#{length},}\b/)
    runs = runs.reject { |run| run.length > 100 }

    # A run of pure whitespace between two words also satisfies the pattern;
    # it strips down to "" and is not a real match.
    runs.map(&:strip).reject(&:empty?)
  end

  # Asks TermExtractor for the terms found in +field+ of +item+, using the
  # file at path +extract_list+ as the term list, and returns the
  # "extracted_codewords" value of the first output record ([] when that
  # value is absent).
  #
  # NOTE(review): TermExtractor comes from the termextractor gem and is not
  # visible here; the meaning of the "codeword" / "case_sensitive" arguments
  # should be confirmed against that gem.
  def find_known_terms(item, field, extract_list)
    extractor = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
    extractor.extractSetTerms(File.read(extract_list), ["codeword"], "case_sensitive")
    JSON.parse(extractor.getAllOutput).first["extracted_codewords"] || []
  end

  # Replaces every term in +extracted_raw+ that case-insensitively equals one
  # of the "codeword" entries in the synonym file with that entry's key, and
  # returns the deduplicated list. Terms without a synonym are kept as they are.
  #
  # synonym_list - path to a JSON file shaped like
  #                { "Canonical" => { "codeword" => ["variant", ...] }, ... }.
  def normalize_results(extracted_raw, synonym_list)
    synonyms = load_synonyms(synonym_list)
    normalized = extracted_raw.dup

    extracted_raw.each do |extracted|
      needle = extracted.downcase

      synonyms.each do |canonical, entry|
        # Array() tolerates synonym entries that have no "codeword" list.
        Array(entry["codeword"]).each do |word|
          next unless word.downcase == needle

          normalized.delete(extracted)
          normalized.push(canonical)
        end
      end
    end

    normalized.uniq
  end

  # Runs every extractor over each configured field of each input record and
  # stores the combined, normalized terms on the record under the match name.
  #
  # allcaps_length - minimum run length passed to get_allcaps.
  # extract_list   - path to the JSON file used both as the known-term list
  #                  and as the synonym list.
  # merge_field    - key of an existing record value to union into the
  #                  results; may be nil.
  #
  # Returns the array of updated records. The records are the parsed input
  # objects themselves, modified in place.
  def search_fields(allcaps_length, extract_list, merge_field)
    # Start from an empty list so that calling this method again does not
    # append the same records a second time.
    @output = []

    @input.each do |item|
      item[@match_name] = []

      @fields.each do |field|
        # Array() turns a missing merge value into [] and wraps a scalar, so
        # the unions below always operate on arrays.
        candidates = get_allcaps(item[field], allcaps_length) |
                     comma_list_matches(item[field]) |
                     Array(item[merge_field]) |
                     find_known_terms(item, field, extract_list)

        item[@match_name] |= normalize_results(candidates, extract_list)
      end

      @output.push(item)
    end

    @output
  end

  # Counts how often each match occurs across +results+.
  #
  # Matches are read from the configured match name, falling back to
  # "tools_mentioned"; records with neither key contribute nothing.
  #
  # Returns an array of [match, count] pairs sorted by ascending count
  # (sort_by yields an array, not a Hash).
  def ranked_hash_output(results)
    counts = Hash.new(0)

    results.each do |item|
      matches = item[@match_name] || item[LEGACY_MATCH_NAME]
      Array(matches).each { |match| counts[match] += 1 }
    end

    counts.sort_by { |_match, count| count }
  end

  private

  # Reads and parses the synonym JSON at +path+ once per instance, so that
  # search_fields does not re-read the file for every record and field.
  def load_synonyms(path)
    @synonym_cache ||= {}
    @synonym_cache[path] ||= JSON.parse(File.read(path))
  end
end
120
+
121
+ #dir = "/home/shidash/Data/unknown_test"
122
+ #overalloutput = Array.new
123
+ #Dir.foreach(dir) do |file|
124
+ # next if file == '.' or file == '..'
125
+ # if !File.directory?(dir+"/"+file) && file.include?(".json") && !file.include?(".json.gpg")
126
+ # e = ExtractPatterns.new(File.read(dir+"/"+file), ["additional_info", "job_description", "skills", "summary"], "tools_mentioned")
127
+ # results = e.search_fields(6, "extract_list.json", nil)
128
+ # File.write(file.gsub(".json", "_extracted.json"), JSON.pretty_generate(results))
129
+ # overalloutput.concat(results)
130
+ # end
131
+ #end
132
+
133
+ #e = ExtractPatterns.new(File.read("MECWEDB.json"), ["description", "summary"], "tools_mentioned")
134
+ #puts e.ranked_hash_output(overalloutput)
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: extractpatterns
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - M. C. McGrath
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-25 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Extracts entities and terms from any JSON.
14
+ email: shidash@shidash.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/extractpatterns.rb
20
+ homepage: https://github.com/transparencytoolkit/ExtractPatterns
21
+ licenses:
22
+ - GPL
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.4.8
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Extracts entities and terms
44
+ test_files: []
45
+ has_rdoc: