RubyGems - entityextractor - Versions diffs - 0.0.15 → 0.0.16 - Mend

entityextractor 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 81be4e29ad4a46a33f156efb2b9f52cdc4801701
-  data.tar.gz: ec4a0dcf08239c7b5430d56b352aea1bc4a74f8f
+  metadata.gz: 258653503d54eb66621aee8e606bd808b59f8c9f
+  data.tar.gz: 464f2250a12d1a7f486f3201a8701c9cef40d205
 SHA512:
-  metadata.gz: 20fec5e9765264d945f6d5c2711cb588f4067c8bed0ea62b5b8b7af6651161c9bcfeaa276966bc0869167ca0410d10b6ec3c01a74eca431daa5049da10af2b0f
-  data.tar.gz: 929e7f0ecddb66594f7b9b80f63c424c1e7356b02d7e62b684847789a034dcf1144b08feabc5399e309b703bcc338db7fda18a61c6f1cf526d88e58c251bbf53
+  metadata.gz: e9be2c981f89562784c674497c3ce55d0123066a339bcce7b7be3afa9068b1f59222bb27c81d5e6b542f38ae524df8b66b38581ae7637e311731071bc1ecefa9
+  data.tar.gz: d31f952542ed92dfcdc6e6d02df12fd1070a2c0d6188fa7af87a32ad0e70e6b52ec37ddcdb6c01906df88d7b489c00a48ad5d0ffbc1c5e8195232c8a43b5cf5c

data/lib/entity_extractor.rb ADDED

@@ -0,0 +1,46 @@
+require 'json'
+load 'extract_set_terms.rb'
+class EntityExtractor
+  def initialize(input, extract_from, save_field)
+    @input = JSON.parse(input)
+    @extract_from = extract_from
+    @save_field = save_field
+    @output = Array.new
+  end
+  # Extracts set terms
+  def extractSetTerms(to_extract, extract_term_fields, case_sensitive)
+    @input.each do |item|
+      extract = ExtractSetTerms.new(item, @extract_from, to_extract, extract_term_fields, case_sensitive, @save_field)
+      @output.push(extract.extractTerms)
+    end
+  end
+  # Gets all results in output
+  def getAllOutput
+    JSON.pretty_generate(@output)
+  end
+  # Gets only the results for which terms were found/extracted
+  def getOnlyMatching
+    matches = @output.select { |item| !item[@save_field].empty? }
+    JSON.pretty_generate(matches)
+  end
+  # Gets a list of the extracted terms by how often they occur
+  def getTermList
+    counthash = Hash.new{0}
+    # Increments for each occurrence of term
+    @output.each do |item|
+      item[@save_field].each do |term|
+        counthash[term] += 1
+      end
+    end
+    # Return hash sorted by value
+    return Hash[counthash.sort_by { |k, v| v}]
+  end
+end

data/lib/extract_set_terms.rb ADDED

@@ -0,0 +1,92 @@
+class ExtractSetTerms
+  def initialize(item, extract_field, to_extract, extract_term_fields, case_sensitive, save_field)
+    @item = item
+    @extract_field = extract_field
+    @to_extract = JSON.parse(to_extract)
+    @extract_term_fields = extract_term_fields
+    @case_sensitive = case_sensitive
+    @extract_dict = Hash.new
+    @save_field = save_field
+    @item_out = item
+  end
+  # Gets a list of terms to extract
+  def processHashInput
+    # Go through each item then each field
+    @to_extract.each do |ex_key, ex_value|
+      ex_value.each do |ex_field, ex_term|
+        # Check if it is the right field
+        if ex_field == @extract_term_fields || @extract_term_fields.include?(ex_field)
+          # Make dictionary of terms to extract and overall mapping
+          ex_term.is_a?(Array) ? processArrayInput(ex_term, ex_key) : @extract_dict[term] = ex_key
+        end
+      end
+    end
+  end
+  # Add all items in array to dictionary of terms to extract
+  def processArrayInput(extract_arr, map_val)
+    extract_arr.each do |term|
+      map_val = term if map_val == nil
+      @extract_dict[term] = map_val
+    end
+  end
+  # Check if the term appears in the text
+  def matchTerm?(term, text, case_sensitive)
+    # Downcase term and text if not case sensitive
+    if case_sensitive == false
+      term = term.downcase
+      text = text.downcase
+    end
+    # Return if it maches
+    if text.to_s.match(/\b(#{term})\b/)
+      return true
+    end
+  end
+  # Check if item is case sensitive
+  def isCaseSensitive?(term)
+    if @case_sensitive == "case-sensitive"
+      return true
+    elsif @case_sensitive == "case-insensitive"
+      return false
+    else
+      # Handle item by item variations
+      is_case_sensitive = @to_extract[term[1]][@case_sensitive]
+      if is_case_sensitive == "Yes"
+        return true
+      else return false
+      end
+    end
+  end
+  # Process input list and go through all terms and fields
+  def extractTerms
+    # Process input list
+    @to_extract.is_a?(Hash) ? processHashInput : processArrayInput(@to_extract, nil)
+    @item_out[@save_field] = Array.new
+    # Go through each term and field to check for matches
+    @extract_dict.each do |term|
+      item_case_sensitivity = isCaseSensitive?(term)
+      @extract_field.each do |field|
+        # Add to list of terms if it matches
+        if matchTerm?(term[0], @item[field], item_case_sensitivity)
+          @item_out[@save_field].push(term[1])
+        end
+      end
+    end
+    # Deduplicate and return
+    @item_out[@save_field].uniq!
+    return @item_out
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: entityextractor
 version: !ruby/object:Gem::Version
-  version: 0.0.15
+  version: 0.0.16
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-14 00:00:00.000000000 Z
+date: 2015-04-20 00:00:00.000000000 Z
 dependencies: []
 description: Extracts entities and terms from any JSON.
 email: shidash@shidash.com
@@ -16,9 +16,8 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/entityextractor.rb
-- lib/extractdates.rb
-- lib/handleinput.rb
+- lib/entity_extractor.rb
+- lib/extract_set_terms.rb
 homepage: https://github.com/Shidash/EntityExtractor
 licenses:
 - GPL

data/lib/entityextractor.rb DELETED

@@ -1,173 +0,0 @@
-require 'json'
-load 'extractdates.rb'
-load 'handleinput.rb'
-require 'uploadconvert'
-class EntityExtractor
-  def initialize(input, fieldoutname, *extractfield)
-    @input = JSON.parse(input)
-    @fieldoutname = fieldoutname
-    @extractfield = *extractfield
-    @output = Array.new
-  end
-  # Extract terms input from preset list
-  def extractTerms(extractlist, i, addlist, field)
-    count = 0
-    downcased = i[field].to_s.downcase
-    # Check the item for each term
-    extractlist.each do |t, c|
-      count+=1
-      if c == true
-        if i[field].to_s.match(/\b(#{t})\b/)
-          addlist.push(t) if !addlist.include? t
-        end
-      else
-        if downcased.match(/\b(#{t.downcase})\b/)
-          addlist.push(t) if !addlist.include? t
-        end
-      end
-    end
-  end
-  # Parses terms in all caps
-  def parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
-    if toParse =~ (/[A-Z]{#{minchar}}/)
-      index = toParse =~ (/[A-Z]{#{minchar}}/)
-      charnum = 0
-      # Find word in all caps
-      toParse.each_char do |c|
-        if charnum >= index
-          if toParse[c] == toParse[c].upcase && toParse[c] !~ (/[[:punct:]]/) && toParse[c] !~ (/[[:digit:]]/)
-            charnum += 1
-          else break
-          end
-        else
-          charnum += 1
-        end
-      end
-      # Remove any extra characters
-      if toParse[charnum-2] == " "
-        charnum = charnum-3
-      elsif toParse[charnum-1] == " "
-        charnum = charnum-2
-      else charnum = charnum-1
-      end
-      # Filter out terms in ignoreterms array
-      if !(ignoreterms.include? toParse[index..charnum])
-        addlist.push(toParse[index..charnum])
-      end
-      parsedstring = toParse[0..charnum]
-      toParse.slice! parsedstring
-      parseALLCAPS(toParse, i, minchar, addlist, ignoreterms, savefield, extractfield)
-    # If there are no (more) results, append addlist to JSON
-    else
-      i[extractfield] = savefield
-    end
-  end
-  # Get list of just extracted terms by occurrence
-  def getExtract
-    extracthash = Hash.new
-    # Generate hash of all extracted terms
-    @output.each do |i|
-      i[@fieldoutname].each do |e|
-        if extracthash.has_key? e
-          extracthash[e] += 1
-        else
-          extracthash[e] = 1
-        end
-      end
-    end
-    # Sort hash
-    return Hash[extracthash.sort_by { |k, v| v}]
-  end
-  # Generates JSON output
-  def genJSON
-    JSON.pretty_generate(@output)
-  end
-  def extract(type, minchar, ignoreterms, terms, ignorefields, caseinfo, mapto, *append)
-    flag = 0
-    h = HandleInput.new(terms, ignorefields, caseinfo)
-    extractlist = h.detecttype
-    @input.each do |i|
-      if i.length == 2
-        i = @input
-        flag = 1
-      end
-      addlist = Array.new
-      # Generate set terms list
-      if type == "set"
-        @extractfield.each do |f|
-          extractTerms(extractlist, i, addlist, f)
-        end
-        if mapto
-          i[@fieldoutname] = h.mapout(addlist, mapto)
-        else
-          i[@fieldoutname] = addlist
-        end
-        @output.push(i)
-      # Generate ALLCAPS terms list
-      elsif type == "ALLCAPS"
-        @extractfield.each do |f|
-          savefield = i[f].to_s + " "
-          parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
-        end
-        i[@fieldoutname] = addlist
-        @output.push(i)
-      # Extract dates
-      elsif type == "date"
-        @extractfield.each do |f|
-          d = ExtractDates.new(i[f])
-          appendhash = Hash.new
-          append.each do |a|
-            appendhash[a] = i[a]
-          end
-          outhash = d.chunk(appendhash)
-          @output.push(outhash) if !outhash.empty?
-        end
-      # Extract both set terms and ALLCAPS
-      elsif type == "both"
-        @extractfield.each do |f|
-          savefield = i[f].to_s + " "
-          parseALLCAPS(i[f].to_s, i, minchar, addlist, ignoreterms, savefield, f)
-          extractTerms(extractlist, i, addlist, f)
-        end
-        if mapto
-          i[@fieldoutname] = h.mapout(addlist, mapto)
-        else
-          i[@fieldoutname] = addlist
-        end
-        @output.push(i)
-      end
-      if flag == 1
-        break
-      end
-    end
- end
-end

data/lib/extractdates.rb DELETED

@@ -1,97 +0,0 @@
-require 'treat'
-include Treat::Core::DSL
-require 'date'
-require 'json'
-require 'american_date'
-class ExtractDates
-  def initialize(text)
-    @text = text
-    @output = Array.new
-  end
-  def chunk(append)
-    if !@text.empty?
-      i = @text
-      s = paragraph(i).segment
-      s.each do |j|
-        dateExtract(j, append, j, i)
-      end
-    end
-    return @output
-  end
-  # Finds matches for date formats in the blob from chunk(append)
-  def dateExtract(blob, append, title, description)
-    blobstring = blob.to_s
-    begin
-      # See below, but with yyyy-mm-dd (and months can only start with 0-1
-      if blobstring.match(/(?:19|20)\d{2}(?:-|\/)[0-1]?[0-9](?:-|\/)[0-3]?[0-9]/)
-        save = Regexp.last_match.to_s
-        saveparse = save.gsub("-", "/") # Needed for american_date gem
-        addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
-        # mm-dd-yyyy, mm/dd/yy, and similar. m or d must start with 0-3 (optional) and end in not 0
-        # Year can only start with 19 or 20 if it is four chars, or it could be 2 char
-      elsif blobstring.match(/[0-3]?[0-9](?:-|\/)[0-3]?[0-9](?:-|\/)(?:(?:19|20)\d{2}|\d{2})/)
-        save = Regexp.last_match.to_s
-        saveparse = save.gsub("-", "/")
-        addItem(Date.parse(saveparse).to_s, append, title, description, blobstring, save)
-        # Same as below but with dd before instead of in middle and supports two digit year
-        # Matches: dd Month yyyy, ddth Month yyyy, ddmonthyy ddthmonthyyyy
-      elsif blobstring.match(/(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?) *(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:19|20)?\d{2}/i)
-        save = Regexp.last_match.to_s
-        addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
-        # Matches: Month yyyy, Month dd yyyy, Month ddth yyyy, Month dd, yyyy, Month ddth, yyyy,
-        # Month can be full month or abbreviation, optional dd with 1 optional th/st/nd/rd, yyyy starting in 19 or 20
-        # Case insensitive, optional/variable spaces
-      elsif blobstring.match(/(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Dec(?:ember)?) *(?:(?:[0-3]?[0-9])(?:st|nd|rd|th)?)?,? *(?:19|20)\d{2}/i)
-        save = Regexp.last_match.to_s
-        addItem(Date.parse(save).to_s, append, title, description, blobstring, save)
-        # Matches: yyyy
-        # Must start and end with word boundry, year must start with 19 or 20 and be 4 numbers
-      elsif blobstring.match(/\b(?:19|20)\d{2}\b/)
-        save = Regexp.last_match.to_s
-        addItem(Date.parse(Date.new(save.to_i).to_s), append, title, description, blobstring, save)
-      end
-    rescue
-    end
-  end
-  # Adds and item to the hash
-  def addItem(date, append, title, description, blob, regex)
-    shash = Hash.new
-    shash[:parsed_date] = date
-    shash[:raw_date] = regex
-    shash[:short_chunk] = title
-    # Append fields specified
-    unless append == {nil=>nil}
-      append.each do |k, v|
-        shash[k] = v
-      end
-    end
-    flag = 0
-    @output.each do |o|
-      if (o[:parsed_date] == shash[:parsed_date]) && (o[:short_chunk].to_s == shash[:short_chunk].to_s)
-        flag = 1
-        break
-      end
-    end
-    if flag == 0
-      @output.push(shash)
-    end
-    blob.slice! regex
-    dateExtract(blob, append, title, description)
-  end
-end

data/lib/handleinput.rb DELETED

@@ -1,148 +0,0 @@
-require 'json'
-class HandleInput
-  def initialize(json, ignorefields, caseinfo)
-    @json = json
-    if ignorefields
-      @ignorefields = ignorefields
-    else
-      @ignorefields = Array.new
-    end
-    @caseinfo = caseinfo
-    @output = Array.new
-    @outhash = Hash.new
-  end
-  # Map output to value
-  def mapout(addlist, mapto)
-    outarr = Array.new
-    addlist.each do |a|
-      if mapto == "key"
-        @json.each do |k, v|
-          # If it's a nested hash
-          if v.is_a? Hash
-            # Go through all values
-            v.each do |z, w|
-              # Check if k is already included
-              if !outarr.include? k
-                if w == a
-                  outarr.push(k)
-                end
-              end
-            end
-          else
-            # Map for dictionaries
-            if !outarr.include? k
-              if v == a || k == a
-                outarr.push(k)
-              end
-            end
-          end
-        end
-      elsif mapto == "value"
-        @json.each do |k, v|
-          if !v.is_a? Hash
-            if !outarr.include? v
-              if k == a || v == a
-                outarr.push(v)
-              end
-            end
-          end
-        end
-      else
-        @json.each do |k, v|
-          v.each do |z, w|
-            # Only map if not already matched
-            if !outarr.include? v[mapto]
-              # Check if vals match
-              if w == a
-                outarr.push(v[mapto])
-              end
-            end
-          end
-        end
-      end
-    end
-    return outarr
-  end
-  # Figure out which type of input it is: array, hash, hash with hash values
-  def detecttype
-    if @json.is_a? Array
-      @output = @json
-      checkCase
-    elsif @json.is_a? Hash
-      @json.each do |k, v|
-        if v.is_a? Hash
-          parseValHash
-          break
-        else
-          parseDictionary
-          break
-        end
-      end
-    end
-    return @outhash
-  end
-  # Adds case sensitive preferences
-  def checkCase
-    if @caseinfo == "casesensitive"
-      @output.each do |i|
-        @outhash[i] = true
-      end
-    elsif @caseinfo == "noncasesensitive"
-      @output.each do |i|
-        @outhash[i] = false
-      end
-    end
-  end
-  # Handle hashes where the values are a hash
-  def parseValHash
-    @json.each do |k, v|
-      if !@ignorefields.include? "hashkey"
-        if @caseinfo.include? "hashkey"
-          @outhash[k] = false
-        else
-          @outhash[k] = true
-        end
-      end
-      v.each do |i, j|
-        if !@ignorefields.include? i
-          if @caseinfo.include? i
-            @outhash[j] = false
-          else
-            @outhash[j] = true
-          end
-        end
-      end
-    end
-  end
-  # Handle hashes
-  def parseDictionary
-    @json.each do |k, v|
-      if !@ignorefields.include? "hashkey"
-        if @caseinfo.include? "hashkey"
-          @outhash[k] = false
-        else
-          @outhash[k] = true
-        end
-      end
-      if !@ignorefields.include? "hashval"
-        if @caseinfo.include? "hashval"
-          @outhash[v] = false
-        else
-          @outhash[v] = true
-        end
-      end
-    end
-  end
-end