text_nlp 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,5 +1,7 @@
  source :rubygems

+ gem 'textquery'
+
  group :test do
  gem 'rspec'
  end
data/Gemfile.lock CHANGED
@@ -2,6 +2,7 @@ GEM
  remote: http://rubygems.org/
  specs:
  diff-lcs (1.1.2)
+ polyglot (0.3.1)
  rspec (2.6.0)
  rspec-core (~> 2.6.0)
  rspec-expectations (~> 2.6.0)
@@ -10,9 +11,14 @@ GEM
  rspec-expectations (2.6.0)
  diff-lcs (~> 1.1.2)
  rspec-mocks (2.6.0)
+ textquery (0.1.8)
+ treetop
+ treetop (1.4.9)
+ polyglot (>= 0.3.1)

  PLATFORMS
  ruby

  DEPENDENCIES
  rspec
+ textquery
data/lib/text_nlp.rb CHANGED
@@ -8,6 +8,4 @@ require "text_nlp/string.rb"
  require "text_nlp/expressions.rb"
  require "text_nlp/pattern.rb"
  require "text_nlp/synonyms.rb"
-
- class TextNlp
- end
+ require "text_nlp/stop_list.rb"
data/lib/text_nlp/expressions.rb CHANGED
@@ -12,7 +12,7 @@ class TextNlp

  def <<(expression)
  node = @root
- expression = expression.normalize
+ expression.normalize!
  @values << expression
  tokens = expression.tokenize
  tokens_count = tokens.size
@@ -40,7 +40,7 @@ class TextNlp
  end

  def find(text)
- find_expressions(0,text.normalize.tokenize.map { |t| t })
+ find_expressions(0,text.normalize.tokenize)
  end

  private
data/lib/text_nlp/normalizer.rb CHANGED
@@ -9,6 +9,6 @@ class TextNlp
  text.gsub!(/\s+/," ")
  text.strip!
  text
- end
+ end
  end
  end
data/lib/text_nlp/pattern.rb CHANGED
@@ -1,122 +1,27 @@
  # encoding: UTF-8
+ require 'textquery'

  class TextNlp
  class Pattern
-
- attr_reader :root
-
- def initialize(root_or_string = nil)
- if (root_or_string.is_a?(String))
- @root = parse(root_or_string)
- else
- @root = root_or_string
+
+ def initialize(pattern, options = {})
+ options = {:normalize => true}.merge(options)
+ if options[:normalize]
+ normalize_pattern(pattern)
+ @to_normalize = true
  end
+ @text_query = TextQuery.new(pattern, {:ignorecase => options[:normalize]})
  end
-
- def <<(node)
- @root << node
- end
-
+
  def match?(text)
- @root.evaluate(text)
+ text.normalize! if @to_normalize
+ @text_query.match?(text)
  end

  private
- def parse(expr)
- operators = ['||','&&']
- current_expression, node, opened, closed = '', nil, 0, 0
- expr.chars.each_with_index do |char,i|
- if (char == '(')
- opened += 1
- current_expression << char if ((opened - closed) > 1)
- elsif (char == ')')
- closed += 1
- current_expression << char if ((opened - closed) > 0)
- elsif ((opened == closed) && (operators.include?(expr[i-1..i])))
- node = operator_node(expr[i-1..i])
- node << parse(current_expression[0..-2])
- node << parse(expr[i+1..-1])
- break;
- else
- current_expression << char
- end
- end
- unless node
- if (current_expression.match(/\|{2}|&{2}/))
- node = parse(current_expression)
- else
- node = current_expression[0..0] == '!' ? Not.new(current_expression[1..-1]) : Unary.new(current_expression)
- end
- end
- node
- end
-
- def operator_node(operator)
- node = case operator
- when '||' then Or.new
- when '&&' then And.new
- end
- node
- end
-
- class Composite
- attr_reader :nodes
-
- def initialize(*nodes)
- @nodes = nodes || []
- end
-
- def <<(node)
- @nodes << node
- end
-
- def values
- @nodes.map { |node| node.values }.flatten
- end
- end
-
- class And < Composite
- def evaluate(expr)
- @nodes.each do |node|
- return false unless node.evaluate(expr)
- end
- return true
- end
- end
-
- class Or < Composite
- def evaluate(expr)
- @nodes.each do |node|
- return true if node.evaluate(expr)
- end
- return false
- end
- end
-
- class Unary
- attr_reader :value
-
- def initialize(value)
- @value = value
- @expressions = Expressions.new([@value])
- end
-
- def evaluate(expr)
- @expressions.any?(expr)
- end
-
- def values
- [value]
- end
- end
-
- class Not < Unary
- def evaluate(expr)
- !super(expr)
- end
- def values
- []
- end
+ def normalize_pattern(pattern)
+ pattern.tr!("éèàçîêô","eeacieo")
+ pattern.tr!("!,;?./\\_|[]{}<>:*$%"," ")
  end

  end
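
For reference, a minimal usage sketch of the rewritten Pattern class, based on the spec changes later in this diff; it assumes the gem is loaded via require 'text_nlp'.

    require 'text_nlp'

    # Query syntax is now delegated to the textquery gem:
    # AND / OR keywords, quoted phrases, and -term negation.
    pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
    pattern.match?("cette BD est super")                      # => true
    pattern.match?("cette bd est illisible sur samsung NTC")  # => false

    # Pass :normalize => false to keep matching case- and accent-sensitive.
    strict = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
    strict.match?("cette bd est super")                       # => false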
data/lib/text_nlp/stop_list.rb ADDED
@@ -0,0 +1,44 @@
+ # encoding: UTF-8
+
+ class TextNlp
+ class StopList
+
+ class << self
+ attr_accessor :directory
+ StopList.directory = File.join(File.dirname(__FILE__),'stoplists')
+ end
+
+ def initialize(options = {})
+ @cache = {}
+ options = {:expressions => []}.merge(options)
+ expressions = options[:expressions]
+ if (options.key?(:name))
+ File.foreach(File.join(StopList.directory,"#{options[:name]}.txt")) { |e| expressions << e }
+ end
+ if (options.key?(:names))
+ options[:names].each do |name|
+ File.foreach(File.join(StopList.directory,"#{name}.txt")) { |e| expressions << e }
+ end
+ end
+ if (options.key?(:file))
+ File.foreach(options[:file]) { |e| expressions << e }
+ end
+ if (options.key?(:files))
+ options[:files].each do |file|
+ File.foreach(file) { |e| expressions << e }
+ end
+ end
+ expressions.each { |e| @cache[e.normalize] = true }
+ @expressions = TextNlp::Expressions.new(expressions)
+ end
+
+ def transform(text)
+ @expressions.expressionize(text).map { |expr| @cache.key?(expr) ? nil : expr }.compact.join(' ')
+ end
+
+ def size
+ @expressions.values.size
+ end
+
+ end
+ end
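
A quick sketch of how the new StopList can be used, mirroring spec/stop_list_spec.rb further down in this diff (again assuming require 'text_nlp' loads the library).

    require 'text_nlp'

    # Build a stop list from inline expressions...
    stop_list = TextNlp::StopList.new(:expressions => ['il', 'a', 'ecrit par toto'])
    stop_list.size                                   # => 3
    stop_list.transform("bordel Il fait chaud ici")  # => "bordel fait chaud ici"

    # ...or from a named list under StopList.directory (:name / :names),
    # or from arbitrary files (:file / :files).
    TextNlp::StopList.new(:name => "min_fr").transform("le ballon de zizou")
    # => "ballon zizou"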
data/lib/text_nlp/stoplists/min_fr.txt ADDED
@@ -0,0 +1,43 @@
+ a
+ au
+ aussi
+ aux
+ avec
+ c
+ ce
+ cette
+ contre
+ d
+ dans
+ de
+ des
+ du
+ en
+ et
+ j
+ l
+ la
+ le
+ les
+ mais
+ n
+ ou
+ par
+ pas
+ pour
+ qu
+ que
+ quel
+ quelle
+ quelles
+ quels
+ qui
+ sa
+ sans
+ ses
+ son
+ sous
+ sur
+ un
+ une
+ y
data/lib/text_nlp/string.rb CHANGED
@@ -18,23 +18,31 @@ class String
  self
  end

+ def normalize!
+ unless normalized()
+ replace(self.normalize)
+ self.normalized = true
+ end
+ self
+ end
+
  def tokenize
  (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  def similarity(text)
  score = 0.0
- tokens1 = self.normalize.tokenize
- tokens2 = text.normalize.tokenize
+ tokens1, tokens2 = self.normalize.tokenize, text.normalize.tokenize
  if (tokens1.size > 0 && tokens2.size > 0)
  intersection = tokens1 & tokens2
- score = (((intersection.size.to_f / tokens1.size.to_f) + (intersection.size.to_f / tokens2.size.to_f)) / 2)
+ score = (((intersection.size.to_f / tokens1.size) + (intersection.size.to_f / tokens2.size)) / 2)
  end
  score
  end

- def translate(translator)
- translator.translate(self)
+ def transform(*transformers)
+ transformers = [transformers] unless transformers.respond_to?(:each)
+ transformers.flatten.inject(self) { |text,transformer| transformer.transform(text) }
  end

  end
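
A small sketch of the renamed String#transform (formerly #translate): any object that responds to #transform can be passed, and multiple transformers are applied left to right. Downcaser below is a made-up example, not part of the gem.

    require 'text_nlp'

    # Hypothetical transformer; the only contract is a #transform(text) method.
    class Downcaser
      def transform(text)
        text.downcase
      end
    end

    "TOTO est content".transform(Downcaser.new)  # => "toto est content"
    # Several transformers chain in order, e.g. text.transform(synonyms, stop_list)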
data/lib/text_nlp/synonyms.rb CHANGED
@@ -13,15 +13,16 @@ class TextNlp
  end

  def register(name,synonyms)
- normalized_name = name.normalize
+ name.normalize!
  synonyms.each do |synonym|
+ synonym.normalize!
  @expressions << synonym
- @synonyms[synonym.normalize] = normalized_name
+ @synonyms[synonym] = name
  end
  end

- def translate(text)
- @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.join(' ')
+ def transform(text)
+ @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.compact.join(' ')
  end

  end
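
The corresponding Synonyms usage, taken from spec/synonyms_spec.rb below; translate is now transform, so a Synonyms table can be chained through String#transform like any other transformer.

    require 'text_nlp'

    synonyms = TextNlp::Synonyms.new
    synonyms.register("CAEN", ["smc", "sm caen", "stade malherbe de caen"])
    synonyms.transform("le smc c est de la bombe")
    # => "le caen c est de la bombe"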
data/lib/text_nlp/tokenizer.rb CHANGED
@@ -1,7 +1,7 @@
  # encoding: UTF-8

  class TextNlp
- class Tokenizer
+ class Tokenizer
  def tokenize(text)
  text.split(/\s+/)
  end
data/spec/min_en.txt ADDED
@@ -0,0 +1,2 @@
+ you
+ an
data/spec/min_fr.txt ADDED
@@ -0,0 +1,3 @@
+ le
+ de
+ un
data/spec/pattern_spec.rb CHANGED
@@ -3,11 +3,31 @@ require "spec_helper"

  describe TextNlp::Pattern do

- it "should match or not the pattern" do
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est super").should be_true
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est illisible sur samsung NTC").should be_false
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible sur samsung NTC").should be_false
- TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible").should be_true
+ context "with normalize option" do
+
+ it "should match or not the pattern" do
+ pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
+ pattern.match?("cette BD est super").should be_true
+ pattern.match?("cette bd est illisible sur samsung NTC").should be_false
+ pattern.match?("cette bande dessinee est illisible sur samsung NTC").should be_false
+ pattern.match?("cette bande dessinee est illisible").should be_true
+ pattern = TextNlp::Pattern.new("'toulouse fc' OR ((toulouse OR tfc) AND (foot OR football OR 'ligue 1' OR 'ligue 2' OR l1 OR l2))")
+ pattern.match?("toulouse est une belle ville").should be_false
+ end
+
  end

+ context "with no normalized option" do
+
+ it "should match or not the pattern" do
+ pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
+ pattern.match?("cette BD est super").should be_true
+ pattern.match?("cette bd est super").should be_false
+ pattern.match?("cette bande dessinee est illisible").should be_false
+ pattern.match?("cette bande dessinée est illisible").should be_true
+ pattern.match?("cette bande dessinée est illisible sur samsung").should be_false
+ end
+
+ end
+
  end
data/spec/stop_list_spec.rb ADDED
@@ -0,0 +1,34 @@
+ # encoding: utf-8
+ require "spec_helper"
+
+ describe TextNlp::StopList do
+
+ it "should remove the words/expressions defined by the stop list" do
+
+ TextNlp::StopList.directory = File.dirname(__FILE__)
+
+ stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'])
+ stop_list.size.should eq 3
+ stop_list.transform("bordel Il fait chaud ici").should eq 'bordel fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'bordel fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'bordel fait chaud ici ecrit par titi'
+
+ stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'], :file => File.join(File.dirname(__FILE__),"stop_list_toto.txt"))
+ stop_list.size.should eq 5
+ stop_list.transform("bordel Il fait chaud ici").should eq 'fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'fait chaud ici'
+ stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'fait chaud ici ecrit par titi'
+
+ stop_list = TextNlp::StopList.new(:name => "min_fr")
+ stop_list.size.should eq 3
+ stop_list.transform("le ballon de zizou").should eq 'ballon zizou'
+
+ stop_list = TextNlp::StopList.new(:names => ["min_fr","min_en"])
+ stop_list.size.should eq 5
+
+ stop_list = TextNlp::StopList.new(
+ :files => [File.join(File.dirname(__FILE__),"stop_list_toto.txt"),File.join(File.dirname(__FILE__),"stop_list_tutu.txt")])
+ stop_list.size.should eq 4
+ end
+
+ end
data/spec/stop_list_toto.txt ADDED
@@ -0,0 +1,2 @@
+ zut
+ bordel
data/spec/stop_list_tutu.txt ADDED
@@ -0,0 +1,2 @@
+ fsdfsdfsdf sdfdsf
+ eoirezoir uoi ioio
data/spec/string_spec.rb CHANGED
@@ -17,6 +17,16 @@ describe String do
  text.normalize.should eq "TOTO".downcase
  end

+ it "should normalize the receiver string" do
+ text = "TOTO"
+ normalizer = double()
+ String.normalizer = normalizer
+ normalizer.stub(:normalize) { |txt| txt.downcase }
+ text.normalize!
+ text.should eq "TOTO".downcase
+ text.normalized.should be_true
+ end
+
  it "should call tokenizer" do
  text = "TOTO"
  tokenizer = double()
@@ -25,11 +35,20 @@ describe String do
  text.tokenize
  end

- it "should call translator" do
+ it "should call translator / translators" do
  text = "TOTO"
- translator = double()
- translator.should_receive(:translate).with(text)
- text.translate(translator)
+ transformer1 = double()
+ transformer1.should_receive(:transform).with(text)
+ text.transform(transformer1)
+ transformer1 = double()
+ transformer1.stub(:transform) { |text| text.tr("T","U") }
+ transformer2 = double()
+ transformer2.stub(:transform) { |text| text.tr("O","A") }
+ transformer1.should_receive(:transform).with("TOTO")
+ transformer2.should_receive(:transform).with("UOUO")
+ text = text.transform(transformer1,transformer2)
+ text.should eq "UAUA"
+ text.transform([transformer1,transformer2])
  end

  it "should compute similarity" do
data/spec/synonyms_spec.rb CHANGED
@@ -6,18 +6,20 @@ describe TextNlp::Synonyms do
  it "should synonymize the text" do
  synonyms = TextNlp::Synonyms.new
  synonyms.register("CAEN",["smc","sm caen","stade malherbe de caen"])
- synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
- synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+ synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
  end

  it "should synonymize the text" do
  synonyms = TextNlp::Synonyms.new([["CAEN","smc","sm caen","stade malherbe de caen"],["marseille","om"]])
- synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
- synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
- synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+ synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le caen c est de la bombe").should eq "le caen c est de la bombe"
+ synonyms.transform("le om c est de la bombe").should eq "le marseille c est de la bombe"
  end

  end
data/text_nlp.gemspec CHANGED
@@ -1,9 +1,14 @@
  Gem::Specification.new do |s|
  s.name = 'text_nlp'
- s.version = '0.0.2'
- s.date = '2011-07-05'
+ s.version = '0.0.3'
+ s.date = '2011-07-07'
  s.summary = "A minimalist NLP library"
  s.description = s.summary
+
+ s.add_dependency "textquery"
+ s.add_development_dependency "rspec"
+ s.add_development_dependency "rake"
+
  s.authors = ["fonzo14"]
  s.require_paths = ["lib"]
  s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: text_nlp
  version: !ruby/object:Gem::Version
- version: 0.0.2
+ version: 0.0.3
  prerelease:
  platform: ruby
  authors:
@@ -9,8 +9,41 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2011-07-05 00:00:00.000000000Z
- dependencies: []
+ date: 2011-07-07 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: textquery
+ requirement: &86270380 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: *86270380
+ - !ruby/object:Gem::Dependency
+ name: rspec
+ requirement: &86270160 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: *86270160
+ - !ruby/object:Gem::Dependency
+ name: rake
+ requirement: &86269950 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: *86269950
  description: A minimalist NLP library
  email:
  executables: []
@@ -29,13 +62,20 @@ files:
  - lib/text_nlp/expressions.rb
  - lib/text_nlp/normalizer.rb
  - lib/text_nlp/pattern.rb
+ - lib/text_nlp/stop_list.rb
+ - lib/text_nlp/stoplists/min_fr.txt
  - lib/text_nlp/string.rb
  - lib/text_nlp/synonyms.rb
  - lib/text_nlp/tokenizer.rb
  - spec/expressions_spec.rb
+ - spec/min_en.txt
+ - spec/min_fr.txt
  - spec/normalizer_spec.rb
  - spec/pattern_spec.rb
  - spec/spec_helper.rb
+ - spec/stop_list_spec.rb
+ - spec/stop_list_toto.txt
+ - spec/stop_list_tutu.txt
  - spec/string_spec.rb
  - spec/synonyms_spec.rb
  - spec/tokenizer_spec.rb