entityextractor 0.0.13 → 0.0.14

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1c30ad9e4255be56330f8e5b3a60a264ea130fce
4
- data.tar.gz: 4d1b4ddafa8fa612069652e7998891e6e4eeb82e
3
+ metadata.gz: b3393233d6b5a8717d256f130c176b16bd2ed185
4
+ data.tar.gz: 546527bb55ee6d4af863236938460fa66aa4384a
5
5
  SHA512:
6
- metadata.gz: 438aafd665e8f4920ef2565e3e66088cc25c8166513c08226f2bac4e2995825b5f30c3e7575712008c89051e0135169b803316a2fa442a5c5053743054dc0530
7
- data.tar.gz: 47e08bd63255ad299b051da2cf23c2cef22bb9db1c21ec8d09b1e6931120a283c0873ffa260e73471da4924ab773ed2e488bdc010048a3c49a85d8be42acf0b9
6
+ metadata.gz: 20aad2d3d3d0b63dbad195ae96e31b6569a624cc6e7da680a77a81510cfb59c3af97d61d813878a38f8bedaf02335b3712963c2c68d3c75dc01b73c439fa3058
7
+ data.tar.gz: 4b33fca6dc496a4f456928ff336a6ad16554e347f0702d22a72f45b11a021236f61c5e39ae438dbcf83885f935a6555b18a85ab2df7f3467d4698b754c181b40
@@ -96,7 +96,7 @@ class EntityExtractor
96
96
  JSON.pretty_generate(@output)
97
97
  end
98
98
 
99
- def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
99
+ def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
100
100
  flag = 0
101
101
 
102
102
  h = HandleInput.new(terms, ignorefields, caseinfo)
@@ -137,8 +137,14 @@ class EntityExtractor
137
137
  elsif type == "date"
138
138
  @extractfield.each do |f|
139
139
  d = ExtractDates.new(i[f])
140
- outhash = d.chunk(i["path"])
141
- @output.push(outhash)
140
+
141
+ appendhash = Hash.new
142
+ append.each do |a|
143
+ appendhash[a] = i[a]
144
+ end
145
+
146
+ outhash = d.chunk(appendhash)
147
+ @output.push(outhash) if !outhash.empty?
142
148
  end
143
149
 
144
150
  # Extract both set terms and ALLCAPS
data/lib/extractdates.rb CHANGED
@@ -10,63 +10,77 @@ class ExtractDates
10
10
  @output = Array.new
11
11
  end
12
12
 
13
- def chunk(file)
14
- if @text
15
- begin
16
- c = @text.chunk
17
- c.each do |i|
18
- s = paragraph(i).segment
19
- s.each do |j|
20
- dateExtract(j, file, j, i)
21
- end
22
- end
23
- rescue
13
+ def chunk(append)
14
+ if !@text.empty?
15
+ i = @text
16
+ s = paragraph(i).segment
17
+ s.each do |j|
18
+ dateExtract(j, append, j, i)
24
19
  end
25
20
  end
21
+
26
22
  return @output
27
23
  end
28
24
 
29
- def dateExtract(blob, file, title, description)
25
+ # Finds matches for date formats in the blob from chunk(append)
26
+ def dateExtract(blob, append, title, description)
30
27
  blobstring = blob.to_s
31
28
 
32
29
  begin
33
- if blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
34
- save = blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
35
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
36
- elsif blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
37
- save = blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
38
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
39
- elsif blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
40
- save = blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
41
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
42
- elsif blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
43
- save = blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
44
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
45
- elsif blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
46
- save = blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
47
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
48
- elsif blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
49
- save = blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
50
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
51
- elsif blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
52
- save = blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
53
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
30
+ # See below, but with yyyy-mm-dd (and months can only start with 0-1)
31
+ if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
32
+ save = Regexp.last_match.to_s
33
+ saveparse = save.gsub("-", "/") # Needed for american_date gem
34
+ addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
35
+
36
+ # mm-dd-yyyy, mm/dd/yy, and similar. m and d each have an optional leading digit 0-3 followed by any digit 0-9
37
+ # Year must start with 19 or 20 if it has four digits; otherwise it may be any two digits
38
+ elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
39
+ save = Regexp.last_match.to_s
40
+ saveparse = save.gsub("-", "/")
41
+ addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
42
+
43
+ # Same as below but with dd before instead of in middle and supports two digit year
44
+ # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy, ddthmonthyyyy
45
+ elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
46
+ save = Regexp.last_match.to_s
47
+ addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
48
+
49
+ # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
50
+ # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
51
+ # Case insensitive, optional/variable spaces
52
+ elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
53
+ save = Regexp.last_match.to_s
54
+ addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
55
+
56
+ # Matches: yyyy
57
+ # Must start and end with a word boundary; year must start with 19 or 20 and be 4 digits
58
+ elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
59
+ save = Regexp.last_match.to_s
60
+ addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
54
61
  end
62
+
55
63
  rescue
56
64
  end
57
65
  end
58
66
 
59
67
  # Adds an item to the hash
60
- def addItem(date, file, title, description, blob, regex)
68
+ def addItem(date, append, title, description, blob, regex)
61
69
  shash = Hash.new
62
- shash[:date] = date
63
- shash[:file] = file
64
- shash[:title] = title
65
- shash[:description] = description
70
+ shash[:parsed_date] = date
71
+ shash[:raw_date] = regex
72
+ shash[:short_chunk] = title
73
+
74
+ # Append fields specified
75
+ unless append == {nil=>nil}
76
+ append.each do |k, v|
77
+ shash[k] = v
78
+ end
79
+ end
66
80
 
67
81
  flag = 0
68
82
  @output.each do |o|
69
- if (o[:date] == shash[:date]) && (o[:file] == shash[:file]) && (o[:title].to_s == shash[:title].to_s)
83
+ if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
70
84
  flag = 1
71
85
  break
72
86
  end
@@ -77,7 +91,7 @@ class ExtractDates
77
91
  end
78
92
 
79
93
  blob.slice! regex
80
- dateExtract(blob, file, title, description)
94
+ dateExtract(blob, append, title, description)
81
95
  end
82
96
  end
83
97
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.0.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-12 00:00:00.000000000 Z
11
+ date: 2014-08-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Extracts entities and terms from any JSON.
14
14
  email: shidash@shidash.com