RubyGems - phrasie - Versions diffs - 0.1.3 → 0.1.4 - Mend

phrasie 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/README.md CHANGED

@@ -8,7 +8,7 @@ statistical analysis to determine the terms and their strength.
 Based on the excellent Python library [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
-**Tested on Ruby 1.9.2.**
+**Tested on Ruby 1.8.7 and 1.9.2.**
 ## SYNOPSIS:

data/lib/phrasie.rb CHANGED

@@ -3,6 +3,8 @@ $:.unshift(File.dirname(__FILE__)) unless
 require 'phrasie/rules'
 require 'phrasie/tag'
-require 'phrasie/extractor'
-VERSION = '0.1.3'
+require 'phrasie/extractor'
+module Phrasie
+  VERSION = '0.1.4'
+end

data/lib/phrasie/extractor.rb CHANGED

@@ -7,14 +7,15 @@ module Phrasie
     def initialize(options={})
       self.tagger = Tagger.new
-      self.filter = options[:filter] || {:strength => 2, :occur => 3}
+      self.filter = {:strength => 2, :occur => 3}.merge(options[:filter] || {})
     end
     def to_s
       "#<Phrasie::Extractor>"
     end
-    def phrases(input, min_occur=3)
+    # Returns an array of [phrase, occurrences, # of words in phrase]
+    def phrases(input, filter=nil)
       if input.is_a? String
         taggedTerms = self.tagger.tag(input)
       elsif input.is_a? Array
@@ -22,6 +23,13 @@ module Phrasie
       else
         return []
       end
+      unless filter.nil?
+        self.filter = self.filter.merge(filter)
+        if self.filter[:occur].to_s[/%/]
+          self.filter[:occur] = [(taggedTerms.size * 0.01), 2].sort.last.round
+        end
+      end
       terms = {}
       multiterm = []
@@ -29,15 +37,15 @@ module Phrasie
       while taggedTerms.size > 0
         term, tag, norm = taggedTerms.shift
-        if state == SEARCH && tag[0] == "N"
+        if state == SEARCH && tag[0,1] == "N"
           state = NOUN
           add(term, norm, multiterm, terms)
-        elsif state == SEARCH && tag == 'JJ' && term[0].upcase == term[0]
+        elsif state == SEARCH && tag == 'JJ' && term[0,1].upcase == term[0,1]
           state = NOUN
           add(term, norm, multiterm, terms)
-        elsif state == NOUN && tag[0] == "N"
+        elsif state == NOUN && tag[0,1] == "N"
           add(term, norm, multiterm, terms)
-        elsif state == NOUN && tag[0] != "N"
+        elsif state == NOUN && tag[0,1] != "N"
           state = SEARCH
           if multiterm.size > 1
             word = multiterm.map(&:first).join(' ')
@@ -47,18 +55,21 @@ module Phrasie
           multiterm = []
         end
       end
       return terms \
               .map{|phrase, occurance| [phrase, occurance, phrase.split.size] } \
-              .keep_if{|arr| self.validate(*arr)} \
+              .delete_if{|arr| !self.validate(*arr)} \
               .sort_by{|phrase, occurance, strength|  occurance + ((occurance/5.0)*strength) }.reverse
     end
     protected
+    # Validates the phrase is within the bounds of our filter
     def validate(word, occur, strength)
       occur >= self.filter[:occur] || (occur >= 2 && strength >= self.filter[:strength])
     end
+    # Used within #phrases
     def add(term, norm, multiterm, terms)
       multiterm << [term, norm]
       terms[norm] ||= 0

data/lib/phrasie/tag.rb CHANGED

@@ -11,6 +11,7 @@ module Phrasie
       self.tags_by_term = Hash[file.split("\n").map{|line| line.split.first(2)}]
     end
+    # Takes some input text and outputs an array of the words contained in it.
     def tokenize(text)
       terms = []
       text.split(/\s/).each do |term|
@@ -29,6 +30,8 @@ module Phrasie
       return terms
     end
+    # Takes an array from #tokenize, or a string which it pipes through #tokenize,
+    #   and returns the words with part-of-speech tags.
     def tag(input)
       if input.is_a? String
         terms = self.tokenize(input)
@@ -43,7 +46,8 @@ module Phrasie
         tag = self.tags_by_term[term] || "NND"
         tagged_terms << [term, tag, term]
       end
+      # These rules are defined in rules.rb
       rules = [
         'correctDefaultNounTag',
         'verifyProperNounAtSentenceStart',
@@ -56,7 +60,9 @@ module Phrasie
           id, tagged_terms[id], tagged_terms = self.send(rule.to_sym, id, tagged_term, tagged_terms)
         end
       end
       return tagged_terms
     end
   end
 end

data/phrasie.gemspec CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name        = "phrasie"
-  s.version     = '0.1.3'
+  s.version     = '0.1.4'
   s.authors     = ["Ashley Williams"]
   s.email       = ["hi@ashleyw.co.uk"]
   s.summary     = "Determines important terms within a given piece of content."

data/test/test_phrasie.rb CHANGED

@@ -64,31 +64,6 @@ class TestPhrasie < Test::Unit::TestCase
     assert_equal 7, @extractor.phrases(text).size
   end
-  #   [["Jerusalem", 8, 1],
-  #    ["event", 6, 1],
-  #    ["Palestinian", 6, 1],
-  #    ["East Jerusalem", 4, 2],
-  #    ["East", 4, 1],
-  #    ["police", 4, 1],
-  #    ["Israel", 4, 1],
-  #    ["theatre", 3, 1],
-  #    ["Palestinian theatre", 2, 2],
-  #    ["Palestinian Authority", 2, 2],
-  #    ["opening event", 1, 2],
-  #    ["Israeli authorities", 1, 2],
-  #    ["Richard Makepeace", 1, 2],
-  #    ["court order", 1, 2],
-  #    ["literature festival", 1, 2],
-  #    ["British consul-general", 1, 2],
-  #    ["police notice", 1, 2],
-  #    ["security minister", 1, 2],
-  #    ["Israeli police", 1, 2],
-  #    ["peace accords", 1, 2],
-  #    ["Mr Makepeace", 1, 2],
-  #    ["British Council", 1, 2],
-  #    ["Palestinian state", 1, 2],
-  #    ["Palestinians hope", 1, 2]]
   def test_long_text
     assert_equal 10, @extractor.phrases(@long_text).size
   end

metadata CHANGED

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 3
-  version: 0.1.3
+  - 4
+  version: 0.1.4
 platform: ruby
 authors:
 - Ashley Williams