entityextractor 0.0.13 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/entityextractor.rb +9 -3
- data/lib/extractdates.rb +54 -40
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b3393233d6b5a8717d256f130c176b16bd2ed185
|
4
|
+
data.tar.gz: 546527bb55ee6d4af863236938460fa66aa4384a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 20aad2d3d3d0b63dbad195ae96e31b6569a624cc6e7da680a77a81510cfb59c3af97d61d813878a38f8bedaf02335b3712963c2c68d3c75dc01b73c439fa3058
|
7
|
+
data.tar.gz: 4b33fca6dc496a4f456928ff336a6ad16554e347f0702d22a72f45b11a021236f61c5e39ae438dbcf83885f935a6555b18a85ab2df7f3467d4698b754c181b40
|
data/lib/entityextractor.rb
CHANGED
@@ -96,7 +96,7 @@ class EntityExtractor
|
|
96
96
|
JSON.pretty_generate(@output)
|
97
97
|
end
|
98
98
|
|
99
|
-
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
|
99
|
+
def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
|
100
100
|
flag = 0
|
101
101
|
|
102
102
|
h = HandleInput.new(terms, ignorefields, caseinfo)
|
@@ -137,8 +137,14 @@ class EntityExtractor
|
|
137
137
|
elsif type == "date"
|
138
138
|
@extractfield.each do |f|
|
139
139
|
d = ExtractDates.new(i[f])
|
140
|
-
|
141
|
-
|
140
|
+
|
141
|
+
appendhash = Hash.new
|
142
|
+
append.each do |a|
|
143
|
+
appendhash[a] = i[a]
|
144
|
+
end
|
145
|
+
|
146
|
+
outhash = d.chunk(appendhash)
|
147
|
+
@output.push(outhash) if !outhash.empty?
|
142
148
|
end
|
143
149
|
|
144
150
|
# Extract both set terms and ALLCAPS
|
data/lib/extractdates.rb
CHANGED
@@ -10,63 +10,77 @@ class ExtractDates
|
|
10
10
|
@output = Array.new
|
11
11
|
end
|
12
12
|
|
13
|
-
def chunk(
|
14
|
-
if
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
s.each do |j|
|
20
|
-
dateExtract(j, file, j, i)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
rescue
|
13
|
+
def chunk(append)
|
14
|
+
if !@text.empty?
|
15
|
+
i = @text
|
16
|
+
s = paragraph(i).segment
|
17
|
+
s.each do |j|
|
18
|
+
dateExtract(j, append, j, i)
|
24
19
|
end
|
25
20
|
end
|
21
|
+
|
26
22
|
return @output
|
27
23
|
end
|
28
24
|
|
29
|
-
|
25
|
+
# Finds matches for date formats in the blob from chunk(append)
|
26
|
+
def dateExtract(blob, append, title, description)
|
30
27
|
blobstring = blob.to_s
|
31
28
|
|
32
29
|
begin
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
addItem(
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
elsif blobstring.match(/(
|
49
|
-
save =
|
50
|
-
addItem(
|
51
|
-
|
52
|
-
|
53
|
-
|
30
|
+
# See below, but with yyyy-mm-dd (and months can only start with 0-1)
|
31
|
+
if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
|
32
|
+
save = Regexp.last_match.to_s
|
33
|
+
saveparse = save.gsub("-", "/") # Needed for american_date gem
|
34
|
+
addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
|
35
|
+
|
36
|
+
# mm-dd-yyyy, mm/dd/yy, and similar. m or d must start with 0-3 (optional) and end in not 0
|
37
|
+
# Year can only start with 19 or 20 if it is four chars, or it could be 2 char
|
38
|
+
elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
|
39
|
+
save = Regexp.last_match.to_s
|
40
|
+
saveparse = save.gsub("-", "/")
|
41
|
+
addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
|
42
|
+
|
43
|
+
# Same as below but with dd before instead of in middle and supports two digit year
|
44
|
+
# Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
|
45
|
+
elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
|
46
|
+
save = Regexp.last_match.to_s
|
47
|
+
addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
|
48
|
+
|
49
|
+
# Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
|
50
|
+
# Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
|
51
|
+
# Case insensitive, optional/variable spaces
|
52
|
+
elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
|
53
|
+
save = Regexp.last_match.to_s
|
54
|
+
addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
|
55
|
+
|
56
|
+
# Matches: yyyy
|
57
|
+
# Must start and end with word boundary, year must start with 19 or 20 and be 4 numbers
|
58
|
+
elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
|
59
|
+
save = Regexp.last_match.to_s
|
60
|
+
addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
|
54
61
|
end
|
62
|
+
|
55
63
|
rescue
|
56
64
|
end
|
57
65
|
end
|
58
66
|
|
59
67
|
# Adds an item to the hash
|
60
|
-
def addItem(date,
|
68
|
+
def addItem(date, append, title, description, blob, regex)
|
61
69
|
shash = Hash.new
|
62
|
-
shash[:
|
63
|
-
shash[:
|
64
|
-
shash[:
|
65
|
-
|
70
|
+
shash[:parsed_date] = date
|
71
|
+
shash[:raw_date] = regex
|
72
|
+
shash[:short_chunk] = title
|
73
|
+
|
74
|
+
# Append fields specified
|
75
|
+
unless append == {nil=>nil}
|
76
|
+
append.each do |k, v|
|
77
|
+
shash[k] = v
|
78
|
+
end
|
79
|
+
end
|
66
80
|
|
67
81
|
flag = 0
|
68
82
|
@output.each do |o|
|
69
|
-
if (o[:
|
83
|
+
if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
|
70
84
|
flag = 1
|
71
85
|
break
|
72
86
|
end
|
@@ -77,7 +91,7 @@ class ExtractDates
|
|
77
91
|
end
|
78
92
|
|
79
93
|
blob.slice! regex
|
80
|
-
dateExtract(blob,
|
94
|
+
dateExtract(blob, append, title, description)
|
81
95
|
end
|
82
96
|
end
|
83
97
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- M. C. McGrath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Extracts entities and terms from any JSON.
|
14
14
|
email: shidash@shidash.com
|