text_nlp 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,5 +1,7 @@
  source :rubygems

+ gem 'textquery'
+
  group :test do
  gem 'rspec'
  end
data/Gemfile.lock CHANGED
@@ -2,6 +2,7 @@ GEM
  remote: http://rubygems.org/
  specs:
  diff-lcs (1.1.2)
+ polyglot (0.3.1)
  rspec (2.6.0)
  rspec-core (~> 2.6.0)
  rspec-expectations (~> 2.6.0)
@@ -10,9 +11,14 @@ GEM
  rspec-expectations (2.6.0)
  diff-lcs (~> 1.1.2)
  rspec-mocks (2.6.0)
+ textquery (0.1.8)
+ treetop
+ treetop (1.4.9)
+ polyglot (>= 0.3.1)

  PLATFORMS
  ruby

  DEPENDENCIES
  rspec
+ textquery
data/lib/text_nlp.rb CHANGED
@@ -8,6 +8,4 @@ require "text_nlp/string.rb"
  require "text_nlp/expressions.rb"
  require "text_nlp/pattern.rb"
  require "text_nlp/synonyms.rb"
-
- class TextNlp
- end
+ require "text_nlp/stop_list.rb"
data/lib/text_nlp/expressions.rb CHANGED
@@ -12,7 +12,7 @@ class TextNlp

  def <<(expression)
  node = @root
- expression = expression.normalize
+ expression.normalize!
  @values << expression
  tokens = expression.tokenize
  tokens_count = tokens.size
@@ -40,7 +40,7 @@ class TextNlp
  end

  def find(text)
- find_expressions(0,text.normalize.tokenize.map { |t| t })
+ find_expressions(0,text.normalize.tokenize)
  end

  private
data/lib/text_nlp/normalizer.rb CHANGED
@@ -9,6 +9,6 @@ class TextNlp
  text.gsub!(/\s+/," ")
  text.strip!
  text
- end
+ end
  end
  end
data/lib/text_nlp/pattern.rb CHANGED
@@ -1,122 +1,27 @@
  # encoding: UTF-8
+ require 'textquery'

  class TextNlp
  class Pattern
-
- attr_reader :root
-
- def initialize(root_or_string = nil)
- if (root_or_string.is_a?(String))
- @root = parse(root_or_string)
- else
- @root = root_or_string
+
+ def initialize(pattern, options = {})
+ options = {:normalize => true}.merge(options)
+ if options[:normalize]
+ normalize_pattern(pattern)
+ @to_normalize = true
  end
+ @text_query = TextQuery.new(pattern, {:ignorecase => options[:normalize]})
  end
-
- def <<(node)
- @root << node
- end
-
+
  def match?(text)
- @root.evaluate(text)
+ text.normalize! if @to_normalize
+ @text_query.match?(text)
  end

  private
- def parse(expr)
- operators = ['||','&&']
- current_expression, node, opened, closed = '', nil, 0, 0
- expr.chars.each_with_index do |char,i|
- if (char == '(')
- opened += 1
- current_expression << char if ((opened - closed) > 1)
- elsif (char == ')')
- closed += 1
- current_expression << char if ((opened - closed) > 0)
- elsif ((opened == closed) && (operators.include?(expr[i-1..i])))
- node = operator_node(expr[i-1..i])
- node << parse(current_expression[0..-2])
- node << parse(expr[i+1..-1])
- break;
- else
- current_expression << char
- end
- end
- unless node
- if (current_expression.match(/\|{2}|&{2}/))
- node = parse(current_expression)
- else
- node = current_expression[0..0] == '!' ? Not.new(current_expression[1..-1]) : Unary.new(current_expression)
- end
- end
- node
- end
-
- def operator_node(operator)
- node = case operator
- when '||' then Or.new
- when '&&' then And.new
- end
- node
- end
-
- class Composite
- attr_reader :nodes
-
- def initialize(*nodes)
- @nodes = nodes || []
- end
-
- def <<(node)
- @nodes << node
- end
-
- def values
- @nodes.map { |node| node.values }.flatten
- end
- end
-
- class And < Composite
- def evaluate(expr)
- @nodes.each do |node|
- return false unless node.evaluate(expr)
- end
- return true
- end
- end
-
- class Or < Composite
- def evaluate(expr)
- @nodes.each do |node|
- return true if node.evaluate(expr)
- end
- return false
- end
- end
-
- class Unary
- attr_reader :value
-
- def initialize(value)
- @value = value
- @expressions = Expressions.new([@value])
- end
-
- def evaluate(expr)
- @expressions.any?(expr)
- end
-
- def values
- [value]
- end
- end
-
- class Not < Unary
- def evaluate(expr)
- !super(expr)
- end
- def values
- []
- end
+ def normalize_pattern(pattern)
+ pattern.tr!("éèàçîêô","eeacieo")
+ pattern.tr!("!,;?./\\_|[]{}<>:*$%"," ")
  end

  end
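
For reference, a minimal usage sketch of the rewritten Pattern class, based on the spec changes later in this diff; it assumes the gem is loaded via require 'text_nlp'.

    require 'text_nlp'

    # Query syntax is now delegated to the textquery gem:
    # AND / OR keywords, quoted phrases, and -term negation.
    pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
    pattern.match?("cette BD est super")                      # => true
    pattern.match?("cette bd est illisible sur samsung NTC")  # => false

    # Pass :normalize => false to keep matching case- and accent-sensitive.
    strict = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
    strict.match?("cette bd est super")                       # => false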
data/lib/text_nlp/stop_list.rb ADDED
@@ -0,0 +1,44 @@
+ # encoding: UTF-8
+
+ class TextNlp
+ class StopList
+
+ class << self
+ attr_accessor :directory
+ StopList.directory = File.join(File.dirname(__FILE__),'stoplists')
+ end
+
+ def initialize(options = {})
+ @cache = {}
+ options = {:expressions => []}.merge(options)
+ expressions = options[:expressions]
+ if (options.key?(:name))
+ File.foreach(File.join(StopList.directory,"#{options[:name]}.txt")) { |e| expressions << e }
+ end
+ if (options.key?(:names))
+ options[:names].each do |name|
+ File.foreach(File.join(StopList.directory,"#{name}.txt")) { |e| expressions << e }
+ end
+ end
+ if (options.key?(:file))
+ File.foreach(options[:file]) { |e| expressions << e }
+ end
+ if (options.key?(:files))
+ options[:files].each do |file|
+ File.foreach(file) { |e| expressions << e }
+ end
+ end
+ expressions.each { |e| @cache[e.normalize] = true }
+ @expressions = TextNlp::Expressions.new(expressions)
+ end
+
+ def transform(text)
+ @expressions.expressionize(text).map { |expr| @cache.key?(expr) ? nil : expr }.compact.join(' ')
+ end
+
+ def size
+ @expressions.values.size
+ end
+
+ end
+ end
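
A quick sketch of how the new StopList can be used, mirroring spec/stop_list_spec.rb further down in this diff (again assuming require 'text_nlp' loads the library).

    require 'text_nlp'

    # Build a stop list from inline expressions...
    stop_list = TextNlp::StopList.new(:expressions => ['il', 'a', 'ecrit par toto'])
    stop_list.size                                   # => 3
    stop_list.transform("bordel Il fait chaud ici")  # => "bordel fait chaud ici"

    # ...or from a named list under StopList.directory (:name / :names),
    # or from arbitrary files (:file / :files).
    TextNlp::StopList.new(:name => "min_fr").transform("le ballon de zizou")
    # => "ballon zizou"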
data/lib/text_nlp/stoplists/min_fr.txt ADDED
@@ -0,0 +1,43 @@
+ a
+ au
+ aussi
+ aux
+ avec
+ c
+ ce
+ cette
+ contre
+ d
+ dans
+ de
+ des
+ du
+ en
+ et
+ j
+ l
+ la
+ le
+ les
+ mais
+ n
+ ou
+ par
+ pas
+ pour
+ qu
+ que
+ quel
+ quelle
+ quelles
+ quels
+ qui
+ sa
+ sans
+ ses
+ son
+ sous
+ sur
+ un
+ une
+ y
data/lib/text_nlp/string.rb CHANGED
@@ -18,23 +18,31 @@ class String
  self
  end

+ def normalize!
+ unless normalized()
+ replace(self.normalize)
+ self.normalized = true
+ end
+ self
+ end
+
  def tokenize
  (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  def similarity(text)
  score = 0.0
- tokens1 = self.normalize.tokenize
- tokens2 = text.normalize.tokenize
+ tokens1, tokens2 = self.normalize.tokenize, text.normalize.tokenize
  if (tokens1.size > 0 && tokens2.size > 0)
  intersection = tokens1 & tokens2
- score = (((intersection.size.to_f / tokens1.size.to_f) + (intersection.size.to_f / tokens2.size.to_f)) / 2)
+ score = (((intersection.size.to_f / tokens1.size) + (intersection.size.to_f / tokens2.size)) / 2)
  end
  score
  end

- def translate(translator)
- translator.translate(self)
+ def transform(*transformers)
+ transformers = [transformers] unless transformers.respond_to?(:each)
+ transformers.flatten.inject(self) { |text,transformer| transformer.transform(text) }
  end

  end
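
A small sketch of the renamed String#transform (formerly #translate): any object that responds to #transform can be passed, and multiple transformers are applied left to right. Downcaser below is a made-up example, not part of the gem.

    require 'text_nlp'

    # Hypothetical transformer; the only contract is a #transform(text) method.
    class Downcaser
      def transform(text)
        text.downcase
      end
    end

    "TOTO est content".transform(Downcaser.new)  # => "toto est content"
    # Several transformers chain in order, e.g. text.transform(synonyms, stop_list)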
data/lib/text_nlp/synonyms.rb CHANGED
@@ -13,15 +13,16 @@ class TextNlp
  end

  def register(name,synonyms)
- normalized_name = name.normalize
+ name.normalize!
  synonyms.each do |synonym|
+ synonym.normalize!
  @expressions << synonym
- @synonyms[synonym.normalize] = normalized_name
+ @synonyms[synonym] = name
  end
  end

- def translate(text)
- @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.join(' ')
+ def transform(text)
+ @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.compact.join(' ')
  end

  end
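
The corresponding Synonyms usage, taken from spec/synonyms_spec.rb below; translate is now transform, so a Synonyms table can be chained through String#transform like any other transformer.

    require 'text_nlp'

    synonyms = TextNlp::Synonyms.new
    synonyms.register("CAEN", ["smc", "sm caen", "stade malherbe de caen"])
    synonyms.transform("le smc c est de la bombe")
    # => "le caen c est de la bombe"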
data/lib/text_nlp/tokenizer.rb CHANGED
@@ -1,7 +1,7 @@
  # encoding: UTF-8

  class TextNlp
- class Tokenizer
+ class Tokenizer
  def tokenize(text)
  text.split(/\s+/)
  end
data/spec/min_en.txt ADDED
@@ -0,0 +1,2 @@
+ you
+ an
data/spec/min_fr.txt ADDED
@@ -0,0 +1,3 @@
+ le
+ de
+ un
data/spec/pattern_spec.rb CHANGED
@@ -3,11 +3,31 @@ require "spec_helper"

  describe TextNlp::Pattern do

- it "should match or not the pattern" do
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est super").should be_true
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est illisible sur samsung NTC").should be_false
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible sur samsung NTC").should be_false
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible").should be_true
+ context "with normalize option" do
+
+ it "should match or not the pattern" do
+ pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
+ pattern.match?("cette BD est super").should be_true
+ pattern.match?("cette bd est illisible sur samsung NTC").should be_false
+ pattern.match?("cette bande dessinee est illisible sur samsung NTC").should be_false
+ pattern.match?("cette bande dessinee est illisible").should be_true
+ pattern = TextNlp::Pattern.new("'toulouse fc' OR ((toulouse OR tfc) AND (foot OR football OR 'ligue 1' OR 'ligue 2' OR l1 OR l2))")
+ pattern.match?("toulouse est une belle ville").should be_false
+ end
+
  end

+ context "with no normalized option" do
+
+ it "should match or not the pattern" do
+ pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
+ pattern.match?("cette BD est super").should be_true
+ pattern.match?("cette bd est super").should be_false
+ pattern.match?("cette bande dessinee est illisible").should be_false
+ pattern.match?("cette bande dessinée est illisible").should be_true
+ pattern.match?("cette bande dessinée est illisible sur samsung").should be_false
+ end
+
+ end
+
  end
data/spec/stop_list_spec.rb ADDED
@@ -0,0 +1,34 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe TextNlp::StopList do
+
+ it "should remove the words/expressions defined by the stop list" do
+
+ TextNlp::StopList.directory = File.dirname(__FILE__)
+
+ stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'])
+ stop_list.size.should eq 3
+ stop_list.transform("bordel Il fait chaud ici").should eq 'bordel fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'bordel fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'bordel fait chaud ici ecrit par titi'
+
+ stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'], :file => File.join(File.dirname(__FILE__),"stop_list_toto.txt"))
+ stop_list.size.should eq 5
+ stop_list.transform("bordel Il fait chaud ici").should eq 'fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'fait chaud ici ecrit par titi'
+
+ stop_list = TextNlp::StopList.new(:name => "min_fr")
+ stop_list.size.should eq 3
+ stop_list.transform("le ballon de zizou").should eq 'ballon zizou'
+
+ stop_list = TextNlp::StopList.new(:names => ["min_fr","min_en"])
+ stop_list.size.should eq 5
+
+ stop_list = TextNlp::StopList.new(
+ :files => [File.join(File.dirname(__FILE__),"stop_list_toto.txt"),File.join(File.dirname(__FILE__),"stop_list_tutu.txt")])
+ stop_list.size.should eq 4
+ end
+
+ end
data/spec/stop_list_toto.txt ADDED
@@ -0,0 +1,2 @@
+ zut
+ bordel
data/spec/stop_list_tutu.txt ADDED
@@ -0,0 +1,2 @@
+ fsdfsdfsdf sdfdsf
+ eoirezoir uoi ioio
data/spec/string_spec.rb CHANGED
@@ -17,6 +17,16 @@ describe String do
  text.normalize.should eq "TOTO".downcase
  end

+ it "should normalize the receiver string" do
+ text = "TOTO"
+ normalizer = double()
+ String.normalizer = normalizer
+ normalizer.stub(:normalize) { |txt| txt.downcase }
+ text.normalize!
+ text.should eq "TOTO".downcase
+ text.normalized.should be_true
+ end
+
  it "should call tokenizer" do
  text = "TOTO"
  tokenizer = double()
@@ -25,11 +35,20 @@ describe String do
  text.tokenize
  end

- it "should call translator" do
+ it "should call translator / translators" do
  text = "TOTO"
- translator = double()
- translator.should_receive(:translate).with(text)
- text.translate(translator)
+ transformer1 = double()
+ transformer1.should_receive(:transform).with(text)
+ text.transform(transformer1)
+ transformer1 = double()
+ transformer1.stub(:transform) { |text| text.tr("T","U") }
+ transformer2 = double()
+ transformer2.stub(:transform) { |text| text.tr("O","A") }
+ transformer1.should_receive(:transform).with("TOTO")
+ transformer2.should_receive(:transform).with("UOUO")
+ text = text.transform(transformer1,transformer2)
+ text.should eq "UAUA"
+ text.transform([transformer1,transformer2])
  end

  it "should compute similarity" do
data/spec/synonyms_spec.rb CHANGED
@@ -6,18 +6,20 @@ describe TextNlp::Synonyms do
  it "should synonymize the text" do
  synonyms = TextNlp::Synonyms.new
  synonyms.register("CAEN",["smc","sm caen","stade malherbe de caen"])
- synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
- synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+ synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
  end

  it "should synonymize the text" do
  synonyms = TextNlp::Synonyms.new([["CAEN","smc","sm caen","stade malherbe de caen"],["marseille","om"]])
- synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
- synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+ synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le om c est de la bombe").should eq "le marseille c est de la bombe"
  end

  end
data/text_nlp.gemspec CHANGED
@@ -1,9 +1,14 @@
  Gem::Specification.new do |s|
  s.name = 'text_nlp'
- s.version = '0.0.2'
- s.date = '2011-07-05'
+ s.version = '0.0.3'
+ s.date = '2011-07-07'
  s.summary = "A minimalist NLP library"
  s.description = s.summary
+
+ s.add_dependency "textquery"
+ s.add_development_dependency "rspec"
+ s.add_development_dependency "rake"
+
  s.authors = ["fonzo14"]
  s.require_paths = ["lib"]
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: text_nlp
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -9,8 +9,41 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2011-07-05 00:00:00.000000000Z
- dependencies: []
+ date: 2011-07-07 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: textquery
+ requirement: &86270380 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: *86270380
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: &86270160 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: *86270160
+ - !ruby/object:Gem::Dependency
+ name: rake
+ requirement: &86269950 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: *86269950
  description: A minimalist NLP library
  email:
  executables: []
@@ -29,13 +62,20 @@ files:
  - lib/text_nlp/expressions.rb
  - lib/text_nlp/normalizer.rb
  - lib/text_nlp/pattern.rb
+ - lib/text_nlp/stop_list.rb
+ - lib/text_nlp/stoplists/min_fr.txt
  - lib/text_nlp/string.rb
  - lib/text_nlp/synonyms.rb
  - lib/text_nlp/tokenizer.rb
  - spec/expressions_spec.rb
+ - spec/min_en.txt
+ - spec/min_fr.txt
  - spec/normalizer_spec.rb
  - spec/pattern_spec.rb
  - spec/spec_helper.rb
+ - spec/stop_list_spec.rb
+ - spec/stop_list_toto.txt
+ - spec/stop_list_tutu.txt
  - spec/string_spec.rb
  - spec/synonyms_spec.rb
  - spec/tokenizer_spec.rb