entityextractor 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/entityextractor.rb +10 -0
- data/lib/extractdates.rb +58 -0
- metadata +3 -2
data/lib/entityextractor.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'json'
|
2
|
+
load 'extractdates.rb'
|
2
3
|
|
3
4
|
class EntityExtractor
|
4
5
|
def initialize(input, *extractfield)
|
@@ -109,6 +110,14 @@ class EntityExtractor
|
|
109
110
|
i["extract"] = addlist
|
110
111
|
@output.push(i)
|
111
112
|
|
113
|
+
# Extract dates
|
114
|
+
elsif type == "date"
|
115
|
+
@extractfield.each do |f|
|
116
|
+
d = ExtractDates.new(i[f])
|
117
|
+
outhash = d.chunk(i["path"])
|
118
|
+
@output.push(outhash)
|
119
|
+
end
|
120
|
+
|
112
121
|
# Extract both set terms and ALLCAPS
|
113
122
|
elsif type == "both"
|
114
123
|
@extractfield.each do |f|
|
@@ -123,3 +132,4 @@ class EntityExtractor
|
|
123
132
|
end
|
124
133
|
end
|
125
134
|
end
|
135
|
+
|
data/lib/extractdates.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'treat'
|
2
|
+
include Treat::Core::DSL
|
3
|
+
require 'date'
|
4
|
+
require 'json'
|
5
|
+
require 'american_date'
|
6
|
+
|
7
|
+
# Extracts date mentions from a block of text.
#
# The text is split into chunks, each chunk into segments, and every segment
# that DateTime can parse yields one result hash.
#
# NOTE(review): `@text.chunk`, `paragraph(...)` and `.segment` are not defined
# in this class; presumably they come from the Treat DSL loaded at the top of
# the file -- confirm against the Treat documentation.
class ExtractDates
  # text - the raw text to scan; nil/false produces no results.
  def initialize(text)
    @text = text
    @output = []
  end

  # Scans the text and collects one hash per segment that contains a
  # parseable date.
  #
  # file - value stored under :file in every result hash.
  #
  # Returns the accumulated Array of hashes with the keys
  # :date, :file, :title (the segment) and :description (the chunk).
  #
  # Extraction is best-effort: a chunk that fails to process is skipped
  # instead of aborting the chunks that follow it, and a failure while
  # splitting the text returns whatever was collected so far.
  def chunk(file)
    return @output unless @text

    begin
      @text.chunk.each do |para|
        begin
          collect_dates(para, file)
        rescue StandardError
          # Skip only the chunk that failed; keep processing the rest.
          next
        end
      end
    rescue StandardError
      # Splitting or iterating the text failed; keep what was gathered.
    end

    @output
  end

  # Tries to parse a date out of blob.
  #
  # Returns the DateTime#to_s form of the parsed date, or nil when blob
  # cannot be parsed (including when it is not a String).
  def dateExtract(blob)
    # Date formats-
    # mm/dd/yy
    # mm/dd/yyyy
    # Month dd, yyyy
    # Month ddth, yyyy
    # Month yyyy

    # TOADD:
    # Multiple dates
    # Year detection
    # Conditional American dates
    # Time ranges
    # Filtering

    DateTime.parse(blob).to_s
  rescue StandardError
    # Unparseable input is an expected outcome, reported as nil.
    nil
  end

  private

  # Appends a result hash to @output for every segment of para that
  # holds a parseable date.
  def collect_dates(para, file)
    paragraph(para).segment.each do |sentence|
      date = dateExtract(sentence)
      next unless date

      @output.push(date: date, file: file, title: sentence, description: para)
    end
  end
end
|
58
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-03-
|
12
|
+
date: 2014-03-16 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Extracts entities and terms from any JSON.
|
15
15
|
email: shidash@shidash.com
|
@@ -17,6 +17,7 @@ executables: []
|
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
+
- lib/extractdates.rb
|
20
21
|
- lib/entityextractor.rb
|
21
22
|
homepage: https://github.com/Shidash/EntityExtractor
|
22
23
|
licenses:
|