entityextractor 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/extractdates.rb +50 -25
- metadata +2 -2
data/lib/extractdates.rb
CHANGED
@@ -16,16 +16,8 @@ class ExtractDates
|
|
16
16
|
c = @text.chunk
|
17
17
|
c.each do |i|
|
18
18
|
s = paragraph(i).segment
|
19
|
-
dateExtract(s)
|
20
19
|
s.each do |j|
|
21
|
-
|
22
|
-
shash[:date] = dateExtract(j)
|
23
|
-
if shash[:date]
|
24
|
-
shash[:file] = file
|
25
|
-
shash[:title] = j
|
26
|
-
shash[:description] = i
|
27
|
-
@output.push(shash)
|
28
|
-
end
|
20
|
+
dateExtract(j, file, j, i)
|
29
21
|
end
|
30
22
|
end
|
31
23
|
rescue
|
@@ -34,25 +26,58 @@ class ExtractDates
|
|
34
26
|
return @output
|
35
27
|
end
|
36
28
|
|
37
|
-
def dateExtract(blob)
|
38
|
-
|
39
|
-
|
40
|
-
# mm/dd/yyyy
|
41
|
-
# Month dd, yyyy
|
42
|
-
# Month ddth, yyyy
|
43
|
-
# Month yyyy
|
44
|
-
|
45
|
-
# TOADD:
|
46
|
-
# Multiple dates
|
47
|
-
# Year detection
|
48
|
-
# Conditional American dates
|
49
|
-
# Time ranges
|
50
|
-
# Filtering
|
51
|
-
|
29
|
+
def dateExtract(blob, file, title, description)
|
30
|
+
blobstring = blob.to_s
|
31
|
+
|
52
32
|
begin
|
53
|
-
|
33
|
+
if blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
|
34
|
+
save = blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
|
35
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
36
|
+
elsif blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
|
37
|
+
save = blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
|
38
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
39
|
+
elsif blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
|
40
|
+
save = blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
|
41
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
42
|
+
elsif blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
|
43
|
+
save = blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
|
44
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
45
|
+
elsif blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
|
46
|
+
save = blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
|
47
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
48
|
+
elsif blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
|
49
|
+
save = blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
|
50
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
51
|
+
elsif blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
|
52
|
+
save = blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
|
53
|
+
addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
|
54
|
+
end
|
54
55
|
rescue
|
55
56
|
end
|
56
57
|
end
|
58
|
+
|
59
|
+
# Adds an item to the hash
|
60
|
+
def addItem(date, file, title, description, blob, regex)
|
61
|
+
shash = Hash.new
|
62
|
+
shash[:date] = date
|
63
|
+
shash[:file] = file
|
64
|
+
shash[:title] = title
|
65
|
+
shash[:description] = description
|
66
|
+
|
67
|
+
flag = 0
|
68
|
+
@output.each do |o|
|
69
|
+
if (o[:date] == shash[:date]) && (o[:file] == shash[:file]) && (o[:title].to_s == shash[:title].to_s)
|
70
|
+
flag = 1
|
71
|
+
break
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
if flag == 0
|
76
|
+
@output.push(shash)
|
77
|
+
end
|
78
|
+
|
79
|
+
blob.slice! regex
|
80
|
+
dateExtract(blob, file, title, description)
|
81
|
+
end
|
57
82
|
end
|
58
83
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-03-
|
12
|
+
date: 2014-03-18 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Extracts entities and terms from any JSON.
|
15
15
|
email: shidash@shidash.com
|