extractpatterns 0.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/extractpatterns.rb +134 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 24ecd7395c9c79e1f035a2420c1e8d6053816d65
+   data.tar.gz: 9d1f1b45a380ff2de4b5cf2b5e7150f943df29a8
+ SHA512:
+   metadata.gz: 03b98db5070f0ada7452d5738d5c36eb8bdbb54a51b16e23bae28909fa017417ba62769fab87edee0be432f872b8f0fc5d106b105ced3f80bd6ffb271086f140
+   data.tar.gz: a1e84c75ba367661a8ca80625de90d4b4ee5875a9c47e75cd5072ddfd2b79e6b2302fcebca217dd4bb845e545805a179fa0cf9128c99da4d769f643a552d28f2
data/lib/extractpatterns.rb ADDED
@@ -0,0 +1,134 @@
+ require 'json'
+ require 'pry'
+ require 'termextractor'
+
+ class ExtractPatterns
+   def initialize(input, fields, match_name)
+     @input = JSON.parse(input)
+     @fields = fields
+     @match_name = match_name
+     @output = Array.new
+   end
+
+   # Split comma-separated text to find matches
+   def comma_list_matches(value)
+     if value
+       # Split on commas
+       list_items = value.split(",")
+
+       # Only keep items of two words or fewer
+       list_items.reject!{ |item| item.split(" ").length > 2 }
+
+       # Drop the standalone word "and" and periods, then trim whitespace
+       return list_items.map { |match| match.gsub(/\band\b/, "").gsub(".", "").strip }
+     end
+     return []
+   end
+
+   # Get words in ALLCAPS past a certain length
+   def get_allcaps(value, length)
+     if length && value
+       # Get all matches
+       matches = value.scan(/\b(?:[A-Z]|\s){#{length},}\b/)
+
+       # Remove matches that are too long
+       matches.reject!{ |match| match.length > 100 }
+
+       # Remove surrounding whitespace
+       return matches.map{ |match| match.strip }
+     end
+     return []
+   end
+
+   # Extract known terms with TermExtractor
+   def find_known_terms(item, field, extract_list)
+     d = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
+     d.extractSetTerms(File.read(extract_list), ["codeword"], "case_sensitive")
+     return JSON.parse(d.getAllOutput).first["extracted_codewords"]
+   end
+
+   # Normalize synonyms to their canonical names and deduplicate
+   def normalize_results(extracted_raw, synonym_list)
+     synonyms = JSON.parse(File.read(synonym_list))
+     outarr = extracted_raw.dup
+
+     # Go through all extracted terms
+     extracted_raw.each do |extracted|
+       # Go through each entry in the synonym list
+       synonyms.each do |key, value|
+         value["codeword"].each do |word|
+           # Match found: swap the synonym for its canonical key
+           if word.downcase == extracted.downcase
+             outarr.delete(extracted)
+             outarr.push(key)
+           end
+         end
+       end
+     end
+
+     # Return deduplicated results
+     return outarr.uniq
+   end
+
+   # Go through all items in the JSON and all fields to search
+   def search_fields(allcaps_length, extract_list, merge_field)
+     # Extract from each item
+     @input.each do |item|
+       item[@match_name] = Array.new
+
+       @fields.each do |field|
+         # Extract list results, allcaps, and known codewords from each field
+         list_results = comma_list_matches(item[field])
+         allcaps_results = get_allcaps(item[field], allcaps_length)
+         merge_results = item[merge_field] || []
+         known_terms_results = find_known_terms(item, field, extract_list)
+
+         # Merge results and post-process
+         item[@match_name] = item[@match_name] | normalize_results((allcaps_results | list_results | merge_results | known_terms_results),
+                                                                    extract_list)
+       end
+
+       # Push updated item out
+       @output.push(item)
+     end
+
+     return @output
+   end
+
+   # Return [match, count] pairs sorted by count
+   def ranked_hash_output(results)
+     # Collect every match across all items
+     allmatches = Array.new
+     results.each do |i|
+       i[@match_name].each do |match|
+         allmatches.push(match)
+       end
+     end
+
+     # Count occurrences of each match
+     rankedhash = Hash.new
+     allmatches.each do |match|
+       if rankedhash[match]
+         rankedhash[match] += 1
+       else
+         rankedhash[match] = 1
+       end
+     end
+     return rankedhash.sort_by{ |k, v| v }
+   end
+ end
+
+ #dir = "/home/shidash/Data/unknown_test"
+ #overalloutput = Array.new
+ #Dir.foreach(dir) do |file|
+ #  next if file == '.' or file == '..'
+ #  if !File.directory?(dir+"/"+file) && file.include?(".json") && !file.include?(".json.gpg")
+ #    e = ExtractPatterns.new(File.read(dir+"/"+file), ["additional_info", "job_description", "skills", "summary"], "tools_mentioned")
+ #    results = e.search_fields(6, "extract_list.json", nil)
+ #    File.write(file.gsub(".json", "_extracted.json"), JSON.pretty_generate(results))
+ #    overalloutput.concat(results)
+ #  end
+ #end
+
+ #e = ExtractPatterns.new(File.read("MECWEDB.json"), ["description", "summary"], "tools_mentioned")
+ #puts e.ranked_hash_output(overalloutput)
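
The commented-out driver above sketches the intended usage. Below is a minimal runnable example; the input filename, field names, and the contents of extract_list.json are hypothetical, with the file format inferred from normalize_results (each top-level key is a canonical name, and anything in its "codeword" array is folded back to that key) and assumed to be the same format TermExtractor's extractSetTerms accepts:

    require 'json'
    require 'extractpatterns'

    # Hypothetical synonym/codeword list: canonical name => known aliases.
    File.write("extract_list.json", JSON.pretty_generate({
      "XKEYSCORE" => { "codeword" => ["XKEYSCORE", "XKS"] }
    }))

    # records.json: a hypothetical JSON array of objects with these fields.
    e = ExtractPatterns.new(File.read("records.json"),
                            ["job_description", "skills", "summary"],
                            "tools_mentioned")

    # 6 is the minimum length for ALLCAPS matches; extract_list.json serves
    # as both the known-term list and the synonym list; nil = no merge field.
    results = e.search_fields(6, "extract_list.json", nil)

    # [term, count] pairs tallied across all records, sorted by count.
    puts e.ranked_hash_output(results)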
metadata ADDED
@@ -0,0 +1,45 @@
+ --- !ruby/object:Gem::Specification
+ name: extractpatterns
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - M. C. McGrath
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-12-25 00:00:00.000000000 Z
+ dependencies: []
+ description: Extracts entities and terms from any JSON.
+ email: shidash@shidash.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/extractpatterns.rb
+ homepage: https://github.com/transparencytoolkit/ExtractPatterns
+ licenses:
+ - GPL
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.8
+ signing_key:
+ specification_version: 4
+ summary: Extracts entities and terms
+ test_files: []
+ has_rdoc:
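
The metadata above is the serialized gemspec. For reference, a reconstruction of an extractpatterns.gemspec that would produce it, assembled from the fields above (not the author's original file):

    Gem::Specification.new do |s|
      s.name        = "extractpatterns"
      s.version     = "0.0.1"
      s.date        = "2015-12-25"
      s.summary     = "Extracts entities and terms"
      s.description = "Extracts entities and terms from any JSON."
      s.authors     = ["M. C. McGrath"]
      s.email       = "shidash@shidash.com"
      s.files       = ["lib/extractpatterns.rb"]
      s.homepage    = "https://github.com/transparencytoolkit/ExtractPatterns"
      s.license     = "GPL"
    end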