entityextractor 0.0.13 → 0.0.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/entityextractor.rb +9 -3
- data/lib/extractdates.rb +54 -40
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b3393233d6b5a8717d256f130c176b16bd2ed185
|
4
|
+
data.tar.gz: 546527bb55ee6d4af863236938460fa66aa4384a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20aad2d3d3d0b63dbad195ae96e31b6569a624cc6e7da680a77a81510cfb59c3af97d61d813878a38f8bedaf02335b3712963c2c68d3c75dc01b73c439fa3058
|
7
|
+
data.tar.gz: 4b33fca6dc496a4f456928ff336a6ad16554e347f0702d22a72f45b11a021236f61c5e39ae438dbcf83885f935a6555b18a85ab2df7f3467d4698b754c181b40
|
data/lib/entityextractor.rb
CHANGED
@@ -96,7 +96,7 @@ class EntityExtractor
|
|
96
96
|
JSON.pretty_generate(@output)
|
97
97
|
end
|
98
98
|
|
99
|
-
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
|
99
|
+
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
|
100
100
|
flag = 0
|
101
101
|
|
102
102
|
h = HandleInput.new(terms, ignorefields, caseinfo)
|
@@ -137,8 +137,14 @@ class EntityExtractor
|
|
137
137
|
elsif type == "date"
|
138
138
|
@extractfield.each do |f|
|
139
139
|
d = ExtractDates.new(i[f])
|
140
|
-
|
141
|
-
|
140
|
+
|
141
|
+
appendhash = Hash.new
|
142
|
+
append.each do |a|
|
143
|
+
appendhash[a] = i[a]
|
144
|
+
end
|
145
|
+
|
146
|
+
outhash = d.chunk(appendhash)
|
147
|
+
@output.push(outhash) if !outhash.empty?
|
142
148
|
end
|
143
149
|
|
144
150
|
# Extract both set terms and ALLCAPS
|
data/lib/extractdates.rb
CHANGED
@@ -10,63 +10,77 @@ class ExtractDates
|
|
10
10
|
@output = Array.new
|
11
11
|
end
|
12
12
|
|
13
|
-
def chunk(
|
14
|
-
if
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
s.each do |j|
|
20
|
-
dateExtract(j, file, j, i)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
rescue
|
13
|
+
def chunk(append)
|
14
|
+
if !@text.empty?
|
15
|
+
i = @text
|
16
|
+
s = paragraph(i).segment
|
17
|
+
s.each do |j|
|
18
|
+
dateExtract(j, append, j, i)
|
24
19
|
end
|
25
20
|
end
|
21
|
+
|
26
22
|
return @output
|
27
23
|
end
|
28
24
|
|
29
|
-
|
25
|
+
# Finds matches for date formats in the blob from chunk(append)
|
26
|
+
def dateExtract(blob, append, title, description)
|
30
27
|
blobstring = blob.to_s
|
31
28
|
|
32
29
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
addItem(
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
elsif blobstring.match(/(
|
49
|
-
save =
|
50
|
-
addItem(
|
51
|
-
|
52
|
-
|
53
|
-
|
30
|
+
# See below, but with yyyy-mm-dd (and months can only start with 0-1)
|
31
|
+
if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
|
32
|
+
save = Regexp.last_match.to_s
|
33
|
+
saveparse = save.gsub("-", "/") # Needed for american_date gem
|
34
|
+
addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
|
35
|
+
|
36
|
+
# mm-dd-yyyy, mm/dd/yy, and similar. m and d each have an optional leading digit 0-3 followed by one digit 0-9
|
37
|
+
# Year can only start with 19 or 20 if it is four chars, or it could be 2 char
|
38
|
+
elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
|
39
|
+
save = Regexp.last_match.to_s
|
40
|
+
saveparse = save.gsub("-", "/")
|
41
|
+
addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
|
42
|
+
|
43
|
+
# Same as below but with dd before instead of in middle and supports two digit year
|
44
|
+
# Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
|
45
|
+
elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
|
46
|
+
save = Regexp.last_match.to_s
|
47
|
+
addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
|
48
|
+
|
49
|
+
# Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
|
50
|
+
# Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
|
51
|
+
# Case insensitive, optional/variable spaces
|
52
|
+
elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
|
53
|
+
save = Regexp.last_match.to_s
|
54
|
+
addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
|
55
|
+
|
56
|
+
# Matches: yyyy
|
57
|
+
# Must start and end with a word boundary; year must start with 19 or 20 and be 4 digits
|
58
|
+
elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
|
59
|
+
save = Regexp.last_match.to_s
|
60
|
+
addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
|
54
61
|
end
|
62
|
+
|
55
63
|
rescue
|
56
64
|
end
|
57
65
|
end
|
58
66
|
|
59
67
|
# Adds an item to the hash
|
60
|
-
def addItem(date,
|
68
|
+
def addItem(date, append, title, description, blob, regex)
|
61
69
|
shash = Hash.new
|
62
|
-
shash[:
|
63
|
-
shash[:
|
64
|
-
shash[:
|
65
|
-
|
70
|
+
shash[:parsed_date] = date
|
71
|
+
shash[:raw_date] = regex
|
72
|
+
shash[:short_chunk] = title
|
73
|
+
|
74
|
+
# Append fields specified
|
75
|
+
unless append == {nil=>nil}
|
76
|
+
append.each do |k, v|
|
77
|
+
shash[k] = v
|
78
|
+
end
|
79
|
+
end
|
66
80
|
|
67
81
|
flag = 0
|
68
82
|
@output.each do |o|
|
69
|
-
if (o[:
|
83
|
+
if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
|
70
84
|
flag = 1
|
71
85
|
break
|
72
86
|
end
|
@@ -77,7 +91,7 @@ class ExtractDates
|
|
77
91
|
end
|
78
92
|
|
79
93
|
blob.slice! regex
|
80
|
-
dateExtract(blob,
|
94
|
+
dateExtract(blob, append, title, description)
|
81
95
|
end
|
82
96
|
end
|
83
97
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Extracts entities and terms from any JSON.
|
14
14
|
email: shidash@shidash.com
|