RubyGems - entityextractor - Versions diffs - 0.0.13 → 0.0.14 - Mend

entityextractor 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1c30ad9e4255be56330f8e5b3a60a264ea130fce
-  data.tar.gz: 4d1b4ddafa8fa612069652e7998891e6e4eeb82e
+  metadata.gz: b3393233d6b5a8717d256f130c176b16bd2ed185
+  data.tar.gz: 546527bb55ee6d4af863236938460fa66aa4384a
 SHA512:
-  metadata.gz: 438aafd665e8f4920ef2565e3e66088cc25c8166513c08226f2bac4e2995825b5f30c3e7575712008c89051e0135169b803316a2fa442a5c5053743054dc0530
-  data.tar.gz: 47e08bd63255ad299b051da2cf23c2cef22bb9db1c21ec8d09b1e6931120a283c0873ffa260e73471da4924ab773ed2e488bdc010048a3c49a85d8be42acf0b9
+  metadata.gz: 20aad2d3d3d0b63dbad195ae96e31b6569a624cc6e7da680a77a81510cfb59c3af97d61d813878a38f8bedaf02335b3712963c2c68d3c75dc01b73c439fa3058
+  data.tar.gz: 4b33fca6dc496a4f456928ff336a6ad16554e347f0702d22a72f45b11a021236f61c5e39ae438dbcf83885f935a6555b18a85ab2df7f3467d4698b754c181b40

data/lib/entityextractor.rb CHANGED Viewed

@@ -96,7 +96,7 @@ class EntityExtractor
     JSON.pretty_generate(@output)
   end
-  def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto)
+  def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
     flag = 0
     h = HandleInput.new(terms, ignorefields, caseinfo)
@@ -137,8 +137,14 @@ class EntityExtractor
       elsif type == "date"
         @extractfield.each do |f|
           d = ExtractDates.new(i[f])
-          outhash = d.chunk(i["path"])
-          @output.push(outhash)
+          appendhash = Hash.new
+          append.each do |a|
+            appendhash[a] = i[a]
+          end
+          outhash = d.chunk(appendhash)
+          @output.push(outhash) if !outhash.empty?
         end
       # Extract both set terms and ALLCAPS

data/lib/extractdates.rb CHANGED Viewed

@@ -10,63 +10,77 @@ class ExtractDates
     @output = Array.new
   end
-  def chunk(file)
-    if @text
-      begin
-        c = @text.chunk
-        c.each do |i|
-          s = paragraph(i).segment
-          s.each do |j|
-            dateExtract(j, file, j, i)
-          end
-        end
-      rescue
+  def chunk(append)
+    if !@text.empty?
+      i = @text
+      s = paragraph(i).segment
+      s.each do |j|
+        dateExtract(j, append, j, i)
       end
     end
     return @output
   end
-  def dateExtract(blob, file, title, description)
+  # Finds matches for date formats in the blob from chunk(append)
+  def dateExtract(blob, append, title, description)
     blobstring = blob.to_s
     begin
-      if blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
-        save = blobstring.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4})/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
-        save = blobstring.match(/(\d{1,2})-(\d{1,2})-(\d{2,4})/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
-        save = blobstring.match(/(.+?)(\w+ \d{1,2}(st|nd|rd|th|), \d{4})/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
-        save = blobstring.match(/(.+?) ((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) \d{2}(st|nd|rd|th|)( |\)|\]))/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
-        save = blobstring.match(/((?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) [1-2][0,9]\d{2}( |\)|\]))/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
-        save = blobstring.match(/(\d{4})-(\d{2})-(\d{2})/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
-      elsif blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
-        save = blobstring.match(/(\d{4})\/(\d{2})\/(\d{2})/)
-        addItem(DateTime.parse(blob).to_s, file, title, description, blobstring, save.to_s)
+      # See below, but with yyyy-mm-dd (and months can only start with 0-1)
+      if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
+        save = Regexp.last_match.to_s
+        saveparse = save.gsub("-", "/") # Needed for american_date gem
+        addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
+        # mm-dd-yyyy, mm/dd/yy, and similar. m and d are one or two digits; a leading digit, if present, must be 0-3
+        # Year can only start with 19 or 20 if it is four chars, or it could be 2 char
+      elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
+        save = Regexp.last_match.to_s
+        saveparse = save.gsub("-", "/")
+        addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
+        # Same as below but with dd before instead of in middle and supports two digit year
+        # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
+      elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
+        save = Regexp.last_match.to_s
+        addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
+        # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
+        # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
+        # Case insensitive, optional/variable spaces
+      elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
+        save = Regexp.last_match.to_s
+        addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
+        # Matches: yyyy
+        # Must start and end with a word boundary, year must start with 19 or 20 and be 4 digits
+      elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
+        save = Regexp.last_match.to_s
+        addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
       end
     rescue
     end
   end
   # Adds an item to the hash
-  def addItem(date, file, title, description, blob, regex)
+  def addItem(date, append, title, description, blob, regex)
     shash = Hash.new
-    shash[:date] = date
-    shash[:file] = file
-    shash[:title] = title
-    shash[:description] = description
+    shash[:parsed_date] = date
+    shash[:raw_date] = regex
+    shash[:short_chunk] = title
+    # Append fields specified
+    unless append == {nil=>nil}
+      append.each do |k, v|
+        shash[k] = v
+      end
+    end
     flag = 0
     @output.each do |o|
-      if (o[:date] == shash[:date]) && (o[:file] == shash[:file]) && (o[:title].to_s == shash[:title].to_s)
+      if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
         flag = 1
         break
       end
@@ -77,7 +91,7 @@ class ExtractDates
     end
     blob.slice! regex
-    dateExtract(blob, file, title, description)
+    dateExtract(blob, append, title, description)
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: entityextractor
 version: !ruby/object:Gem::Version
-  version: 0.0.13
+  version: 0.0.14
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-12 00:00:00.000000000 Z
+date: 2014-08-22 00:00:00.000000000 Z
 dependencies: []
 description: Extracts entities and terms from any JSON.
 email: shidash@shidash.com