entityextractor 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  require 'json'
2
+ load 'extractdates.rb'
2
3
 
3
4
  class EntityExtractor
4
5
  def initialize(input, *extractfield)
@@ -109,6 +110,14 @@ class EntityExtractor
109
110
  i["extract"] = addlist
110
111
  @output.push(i)
111
112
 
113
+ # Extract dates
114
+ elsif type == "date"
115
+ @extractfield.each do |f|
116
+ d = ExtractDates.new(i[f])
117
+ outhash = d.chunk(i["path"])
118
+ @output.push(outhash)
119
+ end
120
+
112
121
  # Extract both set terms and ALLCAPS
113
122
  elsif type == "both"
114
123
  @extractfield.each do |f|
@@ -123,3 +132,4 @@ class EntityExtractor
123
132
  end
124
133
  end
125
134
  end
135
+
@@ -0,0 +1,58 @@
1
+ require 'treat'
2
+ include Treat::Core::DSL
3
+ require 'date'
4
+ require 'json'
5
+ require 'american_date'
6
+
7
# Scans a piece of text for dates and returns one record per sentence in
# which a date could be parsed.
#
# NOTE(review): `@text.chunk` and `paragraph(...).segment` are presumably
# supplied by the Treat DSL that this file includes at top level -- confirm
# against the Treat version in use.
class ExtractDates
  # text - the raw text to scan; may be nil, in which case #chunk finds nothing.
  def initialize(text)
    @text = text
    @output = []
  end

  # Splits the text into chunks, splits every chunk into segments, and
  # records each segment for which #dateExtract yields a date.
  #
  # file - identifier stored under :file in every record produced.
  #
  # Returns the accumulated Array of Hashes with the keys :date, :file,
  # :title (the segment) and :description (the chunk containing it).
  # Results accumulate in the instance, so calling #chunk again on the same
  # object appends to the records already collected.
  def chunk(file)
    return @output unless @text

    begin
      @text.chunk.each do |section|
        paragraph(section).segment.each do |sentence|
          date = dateExtract(sentence)
          next unless date

          @output.push({ date: date, file: file, title: sentence, description: section })
        end
      end
    rescue StandardError
      # Best effort by design: if chunking or segmenting fails part-way,
      # keep whatever was collected so far instead of raising to the caller.
    end

    @output
  end

  # Tries to parse a date out of blob.
  #
  # Date formats-
  # mm/dd/yy
  # mm/dd/yyyy
  # Month dd, yyyy
  # Month ddth, yyyy
  # Month yyyy
  #
  # TOADD:
  # Multiple dates
  # Year detection
  # Conditional American dates
  # Time ranges
  # Filtering
  #
  # Returns the parsed date as a String (DateTime#to_s), or nil when blob
  # cannot be parsed as a date.
  def dateExtract(blob)
    DateTime.parse(blob).to_s
  rescue StandardError
    # Unparseable or non-string input simply means "no date here".
    nil
  end
end
58
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-01 00:00:00.000000000 Z
12
+ date: 2014-03-16 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Extracts entities and terms from any JSON.
15
15
  email: shidash@shidash.com
@@ -17,6 +17,7 @@ executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
+ - lib/extractdates.rb
20
21
  - lib/entityextractor.rb
21
22
  homepage: https://github.com/Shidash/EntityExtractor
22
23
  licenses: