entityextractor 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1c30ad9e4255be56330f8e5b3a60a264ea130fce
4
- data.tar.gz: 4d1b4ddafa8fa612069652e7998891e6e4eeb82e
3
+ metadata.gz: b3393233d6b5a8717d256f130c176b16bd2ed185
4
+ data.tar.gz: 546527bb55ee6d4af863236938460fa66aa4384a
5
5
  SHA512:
6
- metadata.gz: 438aafd665e8f4920ef2565e3e66088cc25c8166513c08226f2bac4e2995825b5f30c3e7575712008c89051e0135169b803316a2fa442a5c5053743054dc0530
7
- data.tar.gz: 47e08bd63255ad299b051da2cf23c2cef22bb9db1c21ec8d09b1e6931120a283c0873ffa260e73471da4924ab773ed2e488bdc010048a3c49a85d8be42acf0b9
6
+ metadata.gz: 20aad2d3d3d0b63dbad195ae96e31b6569a624cc6e7da680a77a81510cfb59c3af97d61d813878a38f8bedaf02335b3712963c2c68d3c75dc01b73c439fa3058
7
+ data.tar.gz: 4b33fca6dc496a4f456928ff336a6ad16554e347f0702d22a72f45b11a021236f61c5e39ae438dbcf83885f935a6555b18a85ab2df7f3467d4698b754c181b40
@@ -96,7 +96,7 @@ class EntityExtractor
96
96
  JSON.pretty_generate(@output)
97
97
  end
98
98
 
99
- def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
99
+ def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
100
100
  flag = 0
101
101
 
102
102
  h = HandleInput.new(terms, ignorefields, caseinfo)
@@ -137,8 +137,14 @@ class EntityExtractor
137
137
  elsif type == "date"
138
138
  @extractfield.each do |f|
139
139
  d = ExtractDates.new(i[f])
140
- outhash = d.chunk(i["path"])
141
- @output.push(outhash)
140
+
141
+ appendhash = Hash.new
142
+ append.each do |a|
143
+ appendhash[a] = i[a]
144
+ end
145
+
146
+ outhash = d.chunk(appendhash)
147
+ @output.push(outhash) if !outhash.empty?
142
148
  end
143
149
 
144
150
  # Extract both set terms and ALLCAPS
data/lib/extractdates.rb CHANGED
@@ -10,63 +10,77 @@ class ExtractDates
10
10
  @output = Array.new
11
11
  end
12
12
 
13
- def chunk(file)
14
- if @text
15
- begin
16
- c = @text.chunk
17
- c.each do |i|
18
- s = paragraph(i).segment
19
- s.each do |j|
20
- dateExtract(j, file, j, i)
21
- end
22
- end
23
- rescue
13
+ def chunk(append)
14
+ if !@text.empty?
15
+ i = @text
16
+ s = paragraph(i).segment
17
+ s.each do |j|
18
+ dateExtract(j, append, j, i)
24
19
  end
25
20
  end
21
+
26
22
  return @output
27
23
  end
28
24
 
29
- def dateExtract(blob, file, title, description)
25
+ # Finds matches for date formats in the blob from chunk(append)
26
+ def dateExtract(blob, append, title, description)
30
27
  blobstring = blob.to_s
31
28
 
32
29
  begin
33
- if blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
34
- save = blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
35
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
36
- elsif blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
37
- save = blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
38
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
39
- elsif blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
40
- save = blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
41
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
42
- elsif blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
43
- save = blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
44
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
45
- elsif blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
46
- save = blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
47
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
48
- elsif blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
49
- save = blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
50
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
51
- elsif blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
52
- save = blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
53
- addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
30
+ # See below, but with yyyy-mm-dd (and months can only start with 0-1)
31
+ if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
32
+ save = Regexp.last_match.to_s
33
+ saveparse = save.gsub("-", "/") # Needed for american_date gem
34
+ addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
35
+
36
+ # mm-dd-yyyy, mm/dd/yy, and similar. m and d each have an optional leading digit 0-3 followed by any digit
37
+ # Year can only start with 19 or 20 if it is four chars, or it could be 2 char
38
+ elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
39
+ save = Regexp.last_match.to_s
40
+ saveparse = save.gsub("-", "/")
41
+ addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
42
+
43
+ # Same as below but with dd before instead of in middle and supports two digit year
44
+ # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy, ddthmonthyyyy
45
+ elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
46
+ save = Regexp.last_match.to_s
47
+ addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
48
+
49
+ # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
50
+ # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
51
+ # Case insensitive, optional/variable spaces
52
+ elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
53
+ save = Regexp.last_match.to_s
54
+ addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
55
+
56
+ # Matches: yyyy
57
+ # Must start and end with word boundary, year must start with 19 or 20 and be 4 digits
58
+ elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
59
+ save = Regexp.last_match.to_s
60
+ addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
54
61
  end
62
+
55
63
  rescue
56
64
  end
57
65
  end
58
66
 
59
67
  # Adds an item to the hash
60
- def addItem(date, file, title, description, blob, regex)
68
+ def addItem(date, append, title, description, blob, regex)
61
69
  shash = Hash.new
62
- shash[:date] = date
63
- shash[:file] = file
64
- shash[:title] = title
65
- shash[:description] = description
70
+ shash[:parsed_date] = date
71
+ shash[:raw_date] = regex
72
+ shash[:short_chunk] = title
73
+
74
+ # Append fields specified
75
+ unless append == {nil=>nil}
76
+ append.each do |k, v|
77
+ shash[k] = v
78
+ end
79
+ end
66
80
 
67
81
  flag = 0
68
82
  @output.each do |o|
69
- if (o[:date] == shash[:date]) && (o[:file] == shash[:file]) && (o[:title].to_s == shash[:title].to_s)
83
+ if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
70
84
  flag = 1
71
85
  break
72
86
  end
@@ -77,7 +91,7 @@ class ExtractDates
77
91
  end
78
92
 
79
93
  blob.slice! regex
80
- dateExtract(blob, file, title, description)
94
+ dateExtract(blob, append, title, description)
81
95
  end
82
96
  end
83
97
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: entityextractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.13
4
+ version: 0.0.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - M. C. McGrath
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-12 00:00:00.000000000 Z
11
+ date: 2014-08-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Extracts entities and terms from any JSON.
14
14
  email: shidash@shidash.com