text_nlp 0.0.2 → 0.0.3

data/Gemfile CHANGED
@@ -1,5 +1,7 @@
 source :rubygems
 
+gem 'textquery'
+
 group :test do
   gem 'rspec'
 end
@@ -2,6 +2,7 @@ GEM
   remote: http://rubygems.org/
   specs:
     diff-lcs (1.1.2)
+    polyglot (0.3.1)
     rspec (2.6.0)
       rspec-core (~> 2.6.0)
       rspec-expectations (~> 2.6.0)
@@ -10,9 +11,14 @@ GEM
     rspec-expectations (2.6.0)
       diff-lcs (~> 1.1.2)
     rspec-mocks (2.6.0)
+    textquery (0.1.8)
+      treetop
+    treetop (1.4.9)
+      polyglot (>= 0.3.1)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
   rspec
+  textquery
@@ -8,6 +8,4 @@ require "text_nlp/string.rb"
 require "text_nlp/expressions.rb"
 require "text_nlp/pattern.rb"
 require "text_nlp/synonyms.rb"
-
-class TextNlp
-end
+require "text_nlp/stop_list.rb"
@@ -12,7 +12,7 @@ class TextNlp
 
     def <<(expression)
       node = @root
-      expression = expression.normalize
+      expression.normalize!
       @values << expression
       tokens = expression.tokenize
       tokens_count = tokens.size
@@ -40,7 +40,7 @@ class TextNlp
     end
 
     def find(text)
-      find_expressions(0,text.normalize.tokenize.map { |t| t })
+      find_expressions(0,text.normalize.tokenize)
     end
 
     private
@@ -9,6 +9,6 @@ class TextNlp
       text.gsub!(/\s+/," ")
       text.strip!
       text
-    end
+    end
   end
 end
@@ -1,122 +1,27 @@
 # encoding: UTF-8
+require 'textquery'
 
 class TextNlp
   class Pattern
-
-    attr_reader :root
-
-    def initialize(root_or_string = nil)
-      if (root_or_string.is_a?(String))
-        @root = parse(root_or_string)
-      else
-        @root = root_or_string
+
+    def initialize(pattern, options = {})
+      options = {:normalize => true}.merge(options)
+      if options[:normalize]
+        normalize_pattern(pattern)
+        @to_normalize = true
       end
+      @text_query = TextQuery.new(pattern, {:ignorecase => options[:normalize]})
     end
-
-    def <<(node)
-      @root << node
-    end
-
+
     def match?(text)
-      @root.evaluate(text)
+      text.normalize! if @to_normalize
+      @text_query.match?(text)
     end
 
     private
-    def parse(expr)
-      operators = ['||','&&']
-      current_expression, node, opened, closed = '', nil, 0, 0
-      expr.chars.each_with_index do |char,i|
-        if (char == '(')
-          opened += 1
-          current_expression << char if ((opened - closed) > 1)
-        elsif (char == ')')
-          closed += 1
-          current_expression << char if ((opened - closed) > 0)
-        elsif ((opened == closed) && (operators.include?(expr[i-1..i])))
-          node = operator_node(expr[i-1..i])
-          node << parse(current_expression[0..-2])
-          node << parse(expr[i+1..-1])
-          break;
-        else
-          current_expression << char
-        end
-      end
-      unless node
-        if (current_expression.match(/\|{2}|&{2}/))
-          node = parse(current_expression)
-        else
-          node = current_expression[0..0] == '!' ? Not.new(current_expression[1..-1]) : Unary.new(current_expression)
-        end
-      end
-      node
-    end
-
-    def operator_node(operator)
-      node = case operator
-        when '||' then Or.new
-        when '&&' then And.new
-      end
-      node
-    end
-
-    class Composite
-      attr_reader :nodes
-
-      def initialize(*nodes)
-        @nodes = nodes || []
-      end
-
-      def <<(node)
-        @nodes << node
-      end
-
-      def values
-        @nodes.map { |node| node.values }.flatten
-      end
-    end
-
-    class And < Composite
-      def evaluate(expr)
-        @nodes.each do |node|
-          return false unless node.evaluate(expr)
-        end
-        return true
-      end
-    end
-
-    class Or < Composite
-      def evaluate(expr)
-        @nodes.each do |node|
-          return true if node.evaluate(expr)
-        end
-        return false
-      end
-    end
-
-    class Unary
-      attr_reader :value
-
-      def initialize(value)
-        @value = value
-        @expressions = Expressions.new([@value])
-      end
-
-      def evaluate(expr)
-        @expressions.any?(expr)
-      end
-
-      def values
-        [value]
-      end
-    end
-
-    class Not < Unary
-      def evaluate(expr)
-        !super(expr)
-      end
-      def values
-        []
-      end
+    def normalize_pattern(pattern)
+      pattern.tr!("éèàçîêô","eeacieo")
+      pattern.tr!("!,;?./\\_|[]{}<>:*$%"," ")
     end
 
   end
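Note: TextNlp::Pattern now delegates matching to the textquery gem, so patterns use textquery's AND / OR / quoted-phrase / -negation syntax instead of the old ||/&&/! parser. A minimal usage sketch, mirroring the updated spec further down (expected values shown as comments):

    pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
    pattern.match?("cette BD est super")                  # => true
    pattern.match?("cette bd est illisible sur samsung")  # => false
    # With :normalize => false the text is matched as-is (case and accents preserved).
    raw = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
    raw.match?("cette bd est super")                      # => false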
@@ -0,0 +1,44 @@
+# encoding: UTF-8
+
+class TextNlp
+  class StopList
+
+    class << self
+      attr_accessor :directory
+      StopList.directory = File.join(File.dirname(__FILE__),'stoplists')
+    end
+
+    def initialize(options = {})
+      @cache = {}
+      options = {:expressions => []}.merge(options)
+      expressions = options[:expressions]
+      if (options.key?(:name))
+        File.foreach(File.join(StopList.directory,"#{options[:name]}.txt")) { |e| expressions << e }
+      end
+      if (options.key?(:names))
+        options[:names].each do |name|
+          File.foreach(File.join(StopList.directory,"#{name}.txt")) { |e| expressions << e }
+        end
+      end
+      if (options.key?(:file))
+        File.foreach(options[:file]) { |e| expressions << e }
+      end
+      if (options.key?(:files))
+        options[:files].each do |file|
+          File.foreach(file) { |e| expressions << e }
+        end
+      end
+      expressions.each { |e| @cache[e.normalize] = true }
+      @expressions = TextNlp::Expressions.new(expressions)
+    end
+
+    def transform(text)
+      @expressions.expressionize(text).map { |expr| @cache.key?(expr) ? nil : expr }.compact.join(' ')
+    end
+
+    def size
+      @expressions.values.size
+    end
+
+  end
+end
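The new StopList removes stop words and expressions from a text via #transform. A short usage sketch, lifted from the spec added below (a :name lookup reads <name>.txt from StopList.directory):

    stop_list = TextNlp::StopList.new(:expressions => ['il', 'a', 'ecrit par toto'])
    stop_list.size                                   # => 3
    stop_list.transform("bordel Il fait chaud ici")  # => "bordel fait chaud ici"
    # Entries can also be loaded from bundled lists (:name / :names) or plain files (:file / :files).
    fr = TextNlp::StopList.new(:name => "min_fr")
    fr.transform("le ballon de zizou")               # => "ballon zizou"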
@@ -0,0 +1,43 @@
+a
+au
+aussi
+aux
+avec
+c
+ce
+cette
+contre
+d
+dans
+de
+des
+du
+en
+et
+j
+l
+la
+le
+les
+mais
+n
+ou
+par
+pas
+pour
+qu
+que
+quel
+quelle
+quelles
+quels
+qui
+sa
+sans
+ses
+son
+sous
+sur
+un
+une
+y
@@ -18,23 +18,31 @@ class String
     self
   end
 
+  def normalize!
+    unless normalized()
+      replace(self.normalize)
+      self.normalized = true
+    end
+    self
+  end
+
   def tokenize
     (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
   end
 
   def similarity(text)
     score = 0.0
-    tokens1 = self.normalize.tokenize
-    tokens2 = text.normalize.tokenize
+    tokens1, tokens2 = self.normalize.tokenize, text.normalize.tokenize
     if (tokens1.size > 0 && tokens2.size > 0)
       intersection = tokens1 & tokens2
-      score = (((intersection.size.to_f / tokens1.size.to_f) + (intersection.size.to_f / tokens2.size.to_f)) / 2)
+      score = (((intersection.size.to_f / tokens1.size) + (intersection.size.to_f / tokens2.size)) / 2)
     end
     score
   end
 
-  def translate(translator)
-    translator.translate(self)
+  def transform(*transformers)
+    transformers = [transformers] unless transformers.respond_to?(:each)
+    transformers.flatten.inject(self) { |text,transformer| transformer.transform(text) }
   end
 
 end
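String#translate(translator) is replaced by String#transform(*transformers), which folds the string through every object responding to #transform, in order. A sketch, assuming synonyms and stop_list are instances of the TextNlp::Synonyms and TextNlp::StopList classes from this release:

    text = "le smc c est de la bombe"
    text.transform(synonyms)               # single transformer
    text.transform(synonyms, stop_list)    # chained left to right
    text.transform([synonyms, stop_list])  # an array of transformers also works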
@@ -13,15 +13,16 @@ class TextNlp
     end
 
     def register(name,synonyms)
-      normalized_name = name.normalize
+      name.normalize!
       synonyms.each do |synonym|
+        synonym.normalize!
         @expressions << synonym
-        @synonyms[synonym.normalize] = normalized_name
+        @synonyms[synonym] = name
       end
     end
 
-    def translate(text)
-      @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.join(' ')
+    def transform(text)
+      @expressions.expressionize(text).map { |expr| @synonyms.key?(expr) ? @synonyms[expr] : expr }.compact.join(' ')
     end
 
   end
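Synonyms#translate is likewise renamed to #transform, and registered names and synonyms are now normalized in place. A sketch mirroring the updated spec below:

    synonyms = TextNlp::Synonyms.new
    synonyms.register("CAEN", ["smc", "sm caen", "stade malherbe de caen"])
    synonyms.transform("le smc c est de la bombe")   # => "le caen c est de la bombe"
    synonyms.transform("le truc c est de la bombe")  # => "le truc c est de la bombe"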
@@ -1,7 +1,7 @@
 # encoding: UTF-8
 
 class TextNlp
-  class Tokenizer
+  class Tokenizer
     def tokenize(text)
       text.split(/\s+/)
     end
@@ -0,0 +1,2 @@
+you
+an
@@ -0,0 +1,3 @@
+le
+de
+un
@@ -3,11 +3,31 @@ require "spec_helper"
 
 describe TextNlp::Pattern do
 
-  it "should match or not the pattern" do
-    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est super").should be_true
-    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est illisible sur samsung NTC").should be_false
-    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible sur samsung NTC").should be_false
-    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible").should be_true
+  context "with normalize option" do
+
+    it "should match or not the pattern" do
+      pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung")
+      pattern.match?("cette BD est super").should be_true
+      pattern.match?("cette bd est illisible sur samsung NTC").should be_false
+      pattern.match?("cette bande dessinee est illisible sur samsung NTC").should be_false
+      pattern.match?("cette bande dessinee est illisible").should be_true
+      pattern = TextNlp::Pattern.new("'toulouse fc' OR ((toulouse OR tfc) AND (foot OR football OR 'ligue 1' OR 'ligue 2' OR l1 OR l2))")
+      pattern.match?("toulouse est une belle ville").should be_false
+    end
+
   end
 
+  context "with no normalized option" do
+
+    it "should match or not the pattern" do
+      pattern = TextNlp::Pattern.new("(BD OR 'bande dessinée') AND -samsung", :normalize => false)
+      pattern.match?("cette BD est super").should be_true
+      pattern.match?("cette bd est super").should be_false
+      pattern.match?("cette bande dessinee est illisible").should be_false
+      pattern.match?("cette bande dessinée est illisible").should be_true
+      pattern.match?("cette bande dessinée est illisible sur samsung").should be_false
+    end
+
+  end
+
 end
@@ -0,0 +1,34 @@
+# encoding: utf-8
+require "spec_helper"
+
+describe TextNlp::StopList do
+
+  it "should remove the words/expressions defined by the stop list" do
+
+    TextNlp::StopList.directory = File.dirname(__FILE__)
+
+    stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'])
+    stop_list.size.should eq 3
+    stop_list.transform("bordel Il fait chaud ici").should eq 'bordel fait chaud ici'
+    stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'bordel fait chaud ici'
+    stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'bordel fait chaud ici ecrit par titi'
+
+    stop_list = TextNlp::StopList.new(:expressions => ['il','a','ecrit par toto'], :file => File.join(File.dirname(__FILE__),"stop_list_toto.txt"))
+    stop_list.size.should eq 5
+    stop_list.transform("bordel Il fait chaud ici").should eq 'fait chaud ici'
+    stop_list.transform("bordel Il fait chaud ici ecrit par toto").should eq 'fait chaud ici'
+    stop_list.transform("bordel Il fait chaud ici ecrit par titi").should eq 'fait chaud ici ecrit par titi'
+
+    stop_list = TextNlp::StopList.new(:name => "min_fr")
+    stop_list.size.should eq 3
+    stop_list.transform("le ballon de zizou").should eq 'ballon zizou'
+
+    stop_list = TextNlp::StopList.new(:names => ["min_fr","min_en"])
+    stop_list.size.should eq 5
+
+    stop_list = TextNlp::StopList.new(
+      :files => [File.join(File.dirname(__FILE__),"stop_list_toto.txt"),File.join(File.dirname(__FILE__),"stop_list_tutu.txt")])
+    stop_list.size.should eq 4
+  end
+
+end
@@ -0,0 +1,2 @@
+zut
+bordel
@@ -0,0 +1,2 @@
+fsdfsdfsdf sdfdsf
+eoirezoir uoi ioio
@@ -17,6 +17,16 @@ describe String do
     text.normalize.should eq "TOTO".downcase
   end
 
+  it "should normalize the receiver string" do
+    text = "TOTO"
+    normalizer = double()
+    String.normalizer = normalizer
+    normalizer.stub(:normalize) { |txt| txt.downcase }
+    text.normalize!
+    text.should eq "TOTO".downcase
+    text.normalized.should be_true
+  end
+
   it "should call tokenizer" do
     text = "TOTO"
     tokenizer = double()
@@ -25,11 +35,20 @@ describe String do
     text.tokenize
   end
 
-  it "should call translator" do
+  it "should call translator / translators" do
     text = "TOTO"
-    translator = double()
-    translator.should_receive(:translate).with(text)
-    text.translate(translator)
+    transformer1 = double()
+    transformer1.should_receive(:transform).with(text)
+    text.transform(transformer1)
+    transformer1 = double()
+    transformer1.stub(:transform) { |text| text.tr("T","U") }
+    transformer2 = double()
+    transformer2.stub(:transform) { |text| text.tr("O","A") }
+    transformer1.should_receive(:transform).with("TOTO")
+    transformer2.should_receive(:transform).with("UOUO")
+    text = text.transform(transformer1,transformer2)
+    text.should eq "UAUA"
+    text.transform([transformer1,transformer2])
   end
 
   it "should compute similarity" do
@@ -6,18 +6,20 @@ describe TextNlp::Synonyms do
   it "should synonymize the text" do
     synonyms = TextNlp::Synonyms.new
     synonyms.register("CAEN",["smc","sm caen","stade malherbe de caen"])
-    synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
-    synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
-    synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
-    synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+    synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
   end
 
   it "should synonymize the text" do
     synonyms = TextNlp::Synonyms.new([["CAEN","smc","sm caen","stade malherbe de caen"],["marseille","om"]])
-    synonyms.translate("le smc c est de la bombe").should eq "le caen c est de la bombe"
-    synonyms.translate("le truc c est de la bombe").should eq "le truc c est de la bombe"
-    synonyms.translate("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
-    synonyms.translate("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le smc c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le truc c est de la bombe").should eq "le truc c est de la bombe"
+    synonyms.transform("le sm caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le stade malherbe de caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le caen c est de la bombe").should eq "le caen c est de la bombe"
+    synonyms.transform("le om c est de la bombe").should eq "le marseille c est de la bombe"
   end
 
 end
@@ -1,9 +1,14 @@
 Gem::Specification.new do |s|
   s.name = 'text_nlp'
-  s.version = '0.0.2'
-  s.date = '2011-07-05'
+  s.version = '0.0.3'
+  s.date = '2011-07-07'
   s.summary = "A minimalist NLP library"
   s.description = s.summary
+
+  s.add_dependency "textquery"
+  s.add_development_dependency "rspec"
+  s.add_development_dependency "rake"
+
   s.authors = ["fonzo14"]
   s.require_paths = ["lib"]
   s.files = `git ls-files`.split("\n")
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: text_nlp
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 prerelease:
 platform: ruby
 authors:
@@ -9,8 +9,41 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-07-05 00:00:00.000000000Z
-dependencies: []
+date: 2011-07-07 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: textquery
+  requirement: &86270380 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *86270380
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &86270160 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *86270160
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: &86269950 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *86269950
 description: A minimalist NLP library
 email:
 executables: []
@@ -29,13 +62,20 @@ files:
 - lib/text_nlp/expressions.rb
 - lib/text_nlp/normalizer.rb
 - lib/text_nlp/pattern.rb
+- lib/text_nlp/stop_list.rb
+- lib/text_nlp/stoplists/min_fr.txt
 - lib/text_nlp/string.rb
 - lib/text_nlp/synonyms.rb
 - lib/text_nlp/tokenizer.rb
 - spec/expressions_spec.rb
+- spec/min_en.txt
+- spec/min_fr.txt
 - spec/normalizer_spec.rb
 - spec/pattern_spec.rb
 - spec/spec_helper.rb
+- spec/stop_list_spec.rb
+- spec/stop_list_toto.txt
+- spec/stop_list_tutu.txt
 - spec/string_spec.rb
 - spec/synonyms_spec.rb
 - spec/tokenizer_spec.rb