extractpatterns 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/extractpatterns.rb +10 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b4839a2718581a04fd0fba727ebe49a555d0e429
|
4
|
+
data.tar.gz: 6ff61dbdb39ca8db3d994a928ead04b90924706a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b227fcfdb2fbca4a5bb8c127d7b293d85ebcc738a1c0ad6b23fd86115b3d7439b8ba65459b1eac1835f3fe2358f6b9bd25ef74c42f4605c0ec233c21950c9ad4
|
7
|
+
data.tar.gz: 55f286d1d433963fd58c94527de287268e69e6e7f229eb9bb6ae5d13a5396c7a6daea2ab78ae9668b29e34696a820c143fa113e5843cb6589b2c9753941ad40e
|
data/lib/extractpatterns.rb
CHANGED
@@ -43,10 +43,19 @@ class ExtractPatterns
|
|
43
43
|
# Extract set terms
|
44
44
|
def find_known_terms(item, field, extract_list)
|
45
45
|
d = TermExtractor.new(JSON.pretty_generate([item]), [field], "extracted_codewords")
|
46
|
-
d.extractSetTerms(File.read(extract_list), ["codeword"], "case_sensitive")
|
46
|
+
d.extractSetTerms(fixEncode(File.read(extract_list)), ["codeword"], "case_sensitive")
|
47
47
|
return JSON.parse(d.getAllOutput).first["extracted_codewords"]
|
48
48
|
end
|
49
49
|
|
50
|
+
# Fix encoding errors
|
51
|
+
def fixEncode(str)
|
52
|
+
if str.is_a?(String)
|
53
|
+
return str.unpack('C*').pack('U*')
|
54
|
+
else
|
55
|
+
return str
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
50
59
|
# Normalize and match synonyms and deduplicate
|
51
60
|
def normalize_results(extracted_raw, synonym_list)
|
52
61
|
synonyms = JSON.parse(File.read(synonym_list))
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extractpatterns
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-12-
|
11
|
+
date: 2015-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Extracts entities and terms from any JSON.
|
14
14
|
email: shidash@shidash.com
|