entityextractor 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/extractdates.rb +50 -25
  2. metadata +2 -2
data/lib/extractdates.rb CHANGED
@@ -16,16 +16,8 @@ class ExtractDates
16
16
  c = @text.chunk
17
17
  c.each do |i|
18
18
  s = paragraph(i).segment
19
- dateExtract(s)
20
19
  s.each do |j|
21
- shash = Hash.new
22
- shash[:date] = dateExtract(j)
23
- if shash[:date]
24
- shash[:file] = file
25
- shash[:title] = j
26
- shash[:description] = i
27
- @output.push(shash)
28
- end
20
+ dateExtract(j, file, j, i)
29
21
  end
30
22
  end
31
23
  rescue
@@ -34,25 +26,58 @@ class ExtractDates
34
26
  return @output
35
27
  end
36
28
 
37
- def dateExtract(blob)
38
- # Date formats-
39
- # mm/dd/yy
40
- # mm/dd/yyyy
41
- # Month dd, yyyy
42
- # Month ddth, yyyy
43
- # Month yyyy
44
-
45
- # TOADD:
46
- # Multiple dates
47
- # Year detection
48
- # Conditional American dates
49
- # Time ranges
50
- # Filtering
51
-
29
+ def dateExtract(blob, file, title, description)
30
+ blobstring = blob.to_s
31
+
52
32
  begin
53
- return DateTime.parse(blob).to_s
33
+ if blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
34
+ save = blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
35
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
36
+ elsif blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
37
+ save = blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
38
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
39
+ elsif blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
40
+ save = blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
41
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
42
+ elsif blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
43
+ save = blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
44
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
45
+ elsif blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
46
+ save = blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
47
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
48
+ elsif blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
49
+ save = blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
50
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
51
+ elsif blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
52
+ save = blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
53
+ addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
54
+ end
54
55
  rescue
55
56
  end
56
57
  end
58
+
59
+ # Adds an item to the hash
60
+ def addItem(date, file, title, description, blob, regex)
61
+ shash = Hash.new
62
+ shash[:date] = date
63
+ shash[:file] = file
64
+ shash[:title] = title
65
+ shash[:description] = description
66
+
67
+ flag = 0
68
+ @output.each do |o|
69
+ if (o[:date] == shash[:date]) && (o[:file] == shash[:file]) && (o[:title].to_s == shash[:title].to_s)
70
+ flag = 1
71
+ break
72
+ end
73
+ end
74
+
75
+ if flag == 0
76
+ @output.push(shash)
77
+ end
78
+
79
+ blob.slice! regex
80
+ dateExtract(blob, file, title, description)
81
+ end
57
82
  end
58
83
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-03-16 00:00:00.000000000 Z
12
+ date: 2014-03-18 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: Extracts entities and terms from any JSON.
15
15
  email: shidash@shidash.com