entityextractor 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/entityextractor.rb +10 -0
- data/lib/extractdates.rb +58 -0
- metadata +3 -2
data/lib/entityextractor.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'json'
|
2
|
+
load 'extractdates.rb'
|
2
3
|
|
3
4
|
class EntityExtractor
|
4
5
|
def initialize(input, *extractfield)
|
@@ -109,6 +110,14 @@ class EntityExtractor
|
|
109
110
|
i["extract"] = addlist
|
110
111
|
@output.push(i)
|
111
112
|
|
113
|
+
# Extract dates
|
114
|
+
elsif type == "date"
|
115
|
+
@extractfield.each do |f|
|
116
|
+
d = ExtractDates.new(i[f])
|
117
|
+
outhash = d.chunk(i["path"])
|
118
|
+
@output.push(outhash)
|
119
|
+
end
|
120
|
+
|
112
121
|
# Extract both set terms and ALLCAPS
|
113
122
|
elsif type == "both"
|
114
123
|
@extractfield.each do |f|
|
@@ -123,3 +132,4 @@ class EntityExtractor
|
|
123
132
|
end
|
124
133
|
end
|
125
134
|
end
|
135
|
+
|
data/lib/extractdates.rb
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'treat'
|
2
|
+
include Treat::Core::DSL
|
3
|
+
require 'date'
|
4
|
+
require 'json'
|
5
|
+
require 'american_date'
|
6
|
+
|
7
|
+
class ExtractDates
|
8
|
+
def initialize(text)
|
9
|
+
@text = text
|
10
|
+
@output = Array.new
|
11
|
+
end
|
12
|
+
|
13
|
+
def chunk(file)
|
14
|
+
if @text
|
15
|
+
begin
|
16
|
+
c = @text.chunk
|
17
|
+
c.each do |i|
|
18
|
+
s = paragraph(i).segment
|
19
|
+
dateExtract(s)
|
20
|
+
s.each do |j|
|
21
|
+
shash = Hash.new
|
22
|
+
shash[:date] = dateExtract(j)
|
23
|
+
if shash[:date]
|
24
|
+
shash[:file] = file
|
25
|
+
shash[:title] = j
|
26
|
+
shash[:description] = i
|
27
|
+
@output.push(shash)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
rescue
|
32
|
+
end
|
33
|
+
end
|
34
|
+
return @output
|
35
|
+
end
|
36
|
+
|
37
|
+
def dateExtract(blob)
|
38
|
+
# Date formats-
|
39
|
+
# mm/dd/yy
|
40
|
+
# mm/dd/yyyy
|
41
|
+
# Month dd, yyyy
|
42
|
+
# Month ddth, yyyy
|
43
|
+
# Month yyyy
|
44
|
+
|
45
|
+
# TOADD:
|
46
|
+
# Multiple dates
|
47
|
+
# Year detection
|
48
|
+
# Conditional American dates
|
49
|
+
# Time ranges
|
50
|
+
# Filtering
|
51
|
+
|
52
|
+
begin
|
53
|
+
return DateTime.parse(blob).to_s
|
54
|
+
rescue
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: entityextractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-03-
|
12
|
+
date: 2014-03-16 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Extracts entities and terms from any JSON.
|
15
15
|
email: shidash@shidash.com
|
@@ -17,6 +17,7 @@ executables: []
|
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
+
- lib/extractdates.rb
|
20
21
|
- lib/entityextractor.rb
|
21
22
|
homepage: https://github.com/Shidash/EntityExtractor
|
22
23
|
licenses:
|