text_nlp 0.0.0 → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -7,4 +7,5 @@ tmp/**/*
7
7
  bin/*
8
8
  vendor/gems/*
9
9
  !vendor/gems/cache/
10
- .sass-cache/*
10
+ .sass-cache/*
11
+ *.gem
@@ -0,0 +1,75 @@
1
+ # encoding: UTF-8
2
+
3
class TextNlp
  # Dictionary of (possibly multi-token) expressions stored as a token trie.
  #
  # Every expression is normalized with String#normalize and split with
  # String#tokenize. Each token is one edge of the trie (a Hash keyed by the
  # token) and the node reached by the last token of an expression carries a
  # :leaf marker. Tokens are expected to be Strings so that they can never
  # collide with the :leaf Symbol key.
  class Expressions

    # Normalized expressions, in insertion order.
    attr_accessor :values

    # expressions - Enumerable of Strings to register (default: none).
    def initialize(expressions = [])
      @root, @values = {}, []
      expressions.each { |expr| self << expr }
    end

    # Registers one expression.
    #
    # expression - String; it is normalized before being stored.
    #
    # Returns the Array of tokens of the normalized expression.
    def <<(expression)
      expression = expression.normalize
      @values << expression
      tokens = expression.tokenize
      node = @root
      tokens.each { |token| node = (node[token] ||= {}) }
      # An expression without tokens must not turn the root into a match.
      node[:leaf] = 1 unless tokens.empty?
      tokens
    end

    # Returns true when at least one registered expression occurs in text.
    def any?(text)
      !find(text).empty?
    end

    # Splits text into tokens, merging every run of tokens that forms a
    # registered expression into a single space-joined element.
    #
    # The text is normalized first, exactly as #find does, so that both
    # methods see the same tokens. Matching is done on whole tokens: the
    # longest expression starting at each position wins, and tokens are never
    # glued together through a textual delimiter.
    #
    # Returns an Array of Strings.
    def expressionize(text)
      tokens = text.normalize.tokenize
      segments = []
      each_segment(tokens) do |index, length, _matched|
        segments << tokens[index, length].join(' ')
      end
      segments
    end

    # Returns the Array of registered expressions found in text, in order of
    # appearance (an expression occurring twice is listed twice).
    def find(text)
      find_expressions(0, text.normalize.tokenize)
    end

    private

    # Collects into +expressions+ every expression found in +tokens+ from
    # +start_index+ on, and returns +expressions+.
    def find_expressions(start_index, tokens, expressions = [])
      each_segment(tokens, start_index) do |index, length, matched|
        expressions << tokens[index, length].join(' ') if matched
      end
      expressions
    end

    # Walks +tokens+ from +start_index+ and yields (index, length, matched)
    # for each consecutive segment: either the longest expression starting at
    # index (matched == true) or a single unmatched token (length == 1).
    # Iterative on purpose: the walk must not be bounded by the call stack.
    def each_segment(tokens, start_index = 0)
      index = start_index
      while index < tokens.size
        length = longest_match_length(tokens, index)
        span = length > 0 ? length : 1
        yield index, span, length > 0
        index += span
      end
    end

    # Number of tokens of the longest registered expression that starts at
    # tokens[from], or 0 when there is none. The deepest :leaf seen on the
    # way down is remembered, so a shorter expression is still reported when
    # a longer candidate fails, including when the input ends first.
    def longest_match_length(tokens, from)
      node, best = @root, 0
      from.upto(tokens.size - 1) do |i|
        node = node[tokens[i]]
        break unless node
        best = i - from + 1 if node.key?(:leaf)
      end
      best
    end

  end
end
@@ -1,9 +1,7 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  class TextNlp
4
- class Normalizer
5
- def initialize
6
- end
4
+ class Normalizer
7
5
  def normalize(text)
8
6
  text.downcase!
9
7
  text.tr!("éèàçîêô","eeacieo")
@@ -0,0 +1,123 @@
1
+ # encoding: UTF-8
2
+
3
class TextNlp
  # Boolean pattern over expressions, e.g. "((bd)||(bande dessinée))&&!samsung".
  #
  # Grammar handled by the parser:
  #   * "a||b" and "a&&b" combine two sub-patterns;
  #   * parentheses group sub-patterns;
  #   * a leading "!" negates an expression or a parenthesized group;
  #   * whitespace around operators, parentheses and operands is ignored.
  # A pattern is split at its first top-level operator, so "a&&b||c" is read
  # as a && (b || c); use parentheses to express another grouping.
  class Pattern

    # Binary operators recognized at the top level of a pattern.
    OPERATORS = ['||', '&&'].freeze

    # Root node of the parsed tree (or whatever object was given to #new).
    attr_reader :root

    # root_or_string - a pattern String to parse, or an already built node
    #                  (any object answering #evaluate and #values).
    def initialize(root_or_string = nil)
      @root = root_or_string.is_a?(String) ? parse(root_or_string) : root_or_string
    end

    # Appends a child node to the root (the root must be a Composite).
    def <<(node)
      @root << node
    end

    # Evaluates the pattern against text; returns what the root node returns.
    def match?(text)
      @root.evaluate(text)
    end

    # Node with children; concrete semantics live in And / Or.
    class Composite
      attr_reader :nodes

      def initialize(*nodes)
        @nodes = nodes
      end

      def <<(node)
        @nodes << node
      end

      # Positive expression values of all children, flattened.
      def values
        @nodes.flat_map { |node| node.values }
      end
    end

    # True when every child matches (true for no children).
    class And < Composite
      def evaluate(expr)
        @nodes.all? { |node| node.evaluate(expr) }
      end
    end

    # True when at least one child matches (false for no children).
    class Or < Composite
      def evaluate(expr)
        @nodes.any? { |node| node.evaluate(expr) }
      end
    end

    # Leaf node: matches when its expression is found in the text.
    class Unary
      attr_reader :value

      def initialize(value)
        @value = value
        @expressions = Expressions.new([@value])
      end

      def evaluate(expr)
        @expressions.any?(expr)
      end

      def values
        [value]
      end
    end

    # Leaf node: matches when its expression is NOT found in the text.
    class Not < Unary
      def evaluate(expr)
        !super(expr)
      end

      # A negated expression contributes no positive value.
      def values
        []
      end
    end

    # Negation of an arbitrary sub-pattern, used for "!( ... )".
    class Negation
      attr_reader :node

      def initialize(node)
        @node = node
      end

      def evaluate(expr)
        !@node.evaluate(expr)
      end

      def values
        []
      end
    end

    private

    # Builds the node tree for the pattern String +expr+.
    def parse(expr)
      expr = unwrap(expr.strip)
      left, operator, right = split_on_operator(expr)
      if operator
        node = operator_node(operator)
        node << parse(left)
        node << parse(right)
        node
      elsif expr.start_with?('!')
        negate(expr[1..-1].strip)
      else
        Unary.new(expr)
      end
    end

    # Node negating +operand+: a plain expression becomes a Not leaf, while a
    # group (or another negation) is parsed first and then wrapped, so that
    # the "!" applies to the whole group.
    def negate(operand)
      if wrapped?(operand) || operand.start_with?('!')
        Negation.new(parse(operand))
      else
        Not.new(operand)
      end
    end

    # Removes parentheses that enclose the whole expression.
    def unwrap(expr)
      expr = expr[1..-2].strip while wrapped?(expr)
      expr
    end

    # True when the first "(" of +expr+ is closed by its very last character.
    def wrapped?(expr)
      return false unless expr.start_with?('(') && expr.end_with?(')')
      depth = 0
      expr.each_char.with_index do |char, i|
        depth += 1 if char == '('
        depth -= 1 if char == ')'
        return i == expr.size - 1 if depth.zero?
      end
      false
    end

    # Splits +expr+ at its first operator located outside any parentheses.
    # Returns [left, operator, right], or nil when there is no such operator.
    def split_on_operator(expr)
      depth = 0
      0.upto(expr.size - 2) do |i|
        case expr[i, 1]
        when '(' then depth += 1
        when ')' then depth -= 1
        else
          operator = expr[i, 2]
          if depth.zero? && OPERATORS.include?(operator)
            return [expr[0...i], operator, expr[(i + 2)..-1]]
          end
        end
      end
      nil
    end

    # Empty composite node for +operator+ ("||" or "&&"); nil otherwise.
    def operator_node(operator)
      case operator
      when '||' then Or.new
      when '&&' then And.new
      end
    end

  end
end
@@ -0,0 +1,36 @@
1
+ # encoding: UTF-8
2
+
3
# Text helpers added to String: pluggable normalization and tokenization,
# plus a token-overlap similarity score.
class String

  # Truthy once the string has been produced by #normalize.
  attr_accessor :normalized

  class << self
    # Object answering #normalize(text); TextNlp::Normalizer when nil.
    attr_accessor :normalizer
    # Object answering #tokenize(text); TextNlp::Tokenizer when nil.
    attr_accessor :tokenizer
  end

  # Returns the normalized form of the string, flagged as normalized so that
  # normalizing it again is a no-op (self is returned).
  #
  # The normalizer receives a copy of the receiver: normalizers may work in
  # place (downcase!, tr!, ...), which must neither alter the caller's string
  # nor fail on a frozen one.
  def normalize
    return self if normalized
    result = (String.normalizer || TextNlp::Normalizer.new).normalize(dup)
    # The flag is stored on the result, so it has to be writable.
    result = result.dup if result.frozen?
    result.normalized = true
    result
  end

  # Returns the tokens of the string, as computed by the configured tokenizer.
  def tokenize
    (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  # Similarity with +text+, between 0.0 and 1.0: the mean of the share of
  # this string's distinct tokens found in +text+ and of the share of
  # +text+'s distinct tokens found in this string. Both sides are normalized
  # and tokenized first; 0.0 is returned when either side has no token.
  #
  # Tokens are de-duplicated before counting: the intersection is a set, so
  # counting repeated tokens in the denominators would keep identical texts
  # with repeated words below 1.0.
  def similarity(text)
    own = normalize.tokenize.uniq
    other = text.normalize.tokenize.uniq
    return 0.0 if own.empty? || other.empty?
    shared = (own & other).size.to_f
    ((shared / own.size) + (shared / other.size)) / 2
  end

end
@@ -0,0 +1,9 @@
1
+ # encoding: UTF-8
2
+
3
class TextNlp
  # Default tokenizer: splits a text on runs of whitespace.
  class Tokenizer
    # text - the String to split.
    #
    # Returns the Array of whitespace-separated tokens. Splitting on /\s+/
    # yields an empty first element when the text starts with whitespace;
    # that element is not a token and is dropped.
    def tokenize(text)
      text.split(/\s+/).reject { |token| token.empty? }
    end
  end
end
data/lib/text_nlp.rb CHANGED
@@ -3,6 +3,10 @@
3
3
  $:.unshift(File.dirname(__FILE__) + '/../lib')
4
4
 
5
5
  require "text_nlp/normalizer.rb"
6
+ require "text_nlp/tokenizer.rb"
7
+ require "text_nlp/string.rb"
8
+ require "text_nlp/expressions.rb"
9
+ require "text_nlp/pattern.rb"
6
10
 
7
11
  class TextNlp
8
12
  end
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
describe TextNlp::Expressions do

  # Fixtures shared by the first two examples; rebuilt for every example.
  let(:expression_values) do
    ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
  end
  let(:etree) { TextNlp::Expressions.new(expression_values) }
  let(:sentence) do
    'nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a'
  end

  it "should find the expressions" do
    found = etree.find(sentence)
    found.size.should eq 3
    ['nicolas sarkozy','carla bruni','carla bruni sarkozy a'].each do |expected|
      found.include?(expected).should be_true
    end
    # Every registered expression must be exposed through #values.
    expression_values.size.should eq etree.values.size
    expression_values.each do |value|
      etree.values.include?(value).should be_true
    end
  end

  it "should expressionize the text" do
    expected = ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a']
    segments = etree.expressionize(sentence)
    segments.should eq expected
    expected.each do |segment|
      segments.include?(segment).should be_true
    end
  end

  it "should returns true or false if any expression present in text" do
    clubs = TextNlp::Expressions.new(['olympique de marseille','lyon'])
    clubs.any?("l olympique de marseille").should be_true
    clubs.any?("lyon c est plus ce que c etait").should be_true
    clubs.any?("marseille pres du vieux port").should be_false
  end

end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
describe TextNlp::Pattern do

  it "should match or not the pattern" do
    # [text, whether the pattern is expected to match]; a fresh Pattern is
    # built for every text.
    cases = [
      ["cette bd est super", true],
      ["cette bd est illisible sur samsung NTC", false],
      ["cette bande dessinée est illisible sur samsung NTC", false],
      ["cette bande dessinée est illisible", true]
    ]
    cases.each do |text, expected|
      outcome = TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?(text)
      if expected
        outcome.should be_true
      else
        outcome.should be_false
      end
    end
  end

end
@@ -0,0 +1,36 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
describe String do

  # Every example starts from the default normalizer and tokenizer.
  before(:each) do
    String.normalizer = nil
    String.tokenizer = nil
  end

  it "should call normalizer" do
    raw = "TOTO"
    fake = double()
    String.normalizer = fake
    fake.stub(:normalize) { |txt| txt.downcase }
    fake.should_receive(:normalize).with(raw)
    normalized = raw.normalize
    normalized.should eq "TOTO".downcase
    # An already normalized string must not reach the normalizer again.
    fake.should_not_receive(:normalize).with(normalized)
    normalized.normalize.should eq "TOTO".downcase
  end

  it "should call tokenizer" do
    raw = "TOTO"
    fake = double()
    String.tokenizer = fake
    fake.should_receive(:tokenize).with(raw)
    raw.tokenize
  end

  it "should compute similarity" do
    exact_scores = [
      ["il fait chaud", "il fait chaud", 1.0],
      ["il fait chaud", "putin c nul ici", 0.0],
      ["", "il fait chaud", 0.0],
      ["il fait chaud", "", 0.0]
    ]
    exact_scores.each do |left, right, score|
      left.similarity(right).should eq score
    end
    "il fait chaud".similarity("youhou ca le fait").should be_within(0.01).of(0.29)
  end

end
@@ -0,0 +1,11 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
describe TextNlp::Tokenizer do

  it "should tokenize text" do
    # Punctuation is kept as-is: only whitespace separates tokens.
    expected = ["Comment","q'ça","se","fait","?","blabla?"]
    tokens = TextNlp::Tokenizer.new.tokenize("Comment q'ça se fait ? blabla?")
    tokens.should eq expected
  end

end
data/text_nlp.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'text_nlp'
3
- s.version = '0.0.0'
3
+ s.version = '0.0.1'
4
4
  s.date = '2011-07-05'
5
5
  s.summary = "A minimalist NLP library"
6
6
  s.description = s.summary
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -26,10 +26,17 @@ files:
26
26
  - README
27
27
  - Rakefile
28
28
  - lib/text_nlp.rb
29
+ - lib/text_nlp/expressions.rb
29
30
  - lib/text_nlp/normalizer.rb
31
+ - lib/text_nlp/pattern.rb
32
+ - lib/text_nlp/string.rb
33
+ - lib/text_nlp/tokenizer.rb
34
+ - spec/expressions_spec.rb
30
35
  - spec/normalizer_spec.rb
36
+ - spec/pattern_spec.rb
31
37
  - spec/spec_helper.rb
32
- - text_nlp-0.0.0.gem
38
+ - spec/string_spec.rb
39
+ - spec/tokenizer_spec.rb
33
40
  - text_nlp.gemspec
34
41
  homepage: http://github.com/fonzo14/text_nlp
35
42
  licenses: []