text_nlp 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -7,4 +7,5 @@ tmp/**/*
7
7
  bin/*
8
8
  vendor/gems/*
9
9
  !vendor/gems/cache/
10
- .sass-cache/*
10
+ .sass-cache/*
11
+ *.gem
@@ -0,0 +1,75 @@
# encoding: UTF-8

class TextNlp
  # Dictionary of (possibly multi-word) expressions that can be looked up
  # inside a text.
  #
  # Expressions are stored in a trie: every node is a Hash whose String keys
  # are tokens leading to child nodes, and whose :leaf key marks the end of a
  # registered expression. Expressions and texts are compared token by token
  # after going through String#normalize and String#tokenize.
  class Expressions

    # Normalized form of every expression added so far, in insertion order.
    attr_accessor :values

    # expressions - collection of Strings to register immediately.
    def initialize(expressions = [])
      @root, @values = {}, []
      expressions.each { |expr| self << expr }
    end

    # Registers one expression. Returns self so that calls can be chained.
    def <<(expression)
      expression = expression.normalize
      @values << expression
      tokens = expression.tokenize
      # An expression without tokens must not flag the root as a leaf.
      unless tokens.empty?
        last_node = tokens.inject(@root) { |node, token| node[token] ||= {} }
        last_node[:leaf] = 1
      end
      self
    end

    # Returns true when at least one registered expression occurs in text.
    def any?(text)
      !find(text).empty?
    end

    # Returns the normalized tokens of text, with every run of tokens that
    # forms a registered expression merged into a single space-joined entry.
    #
    # Matching is done on whole tokens, so an expression is never recognised
    # across a token boundary and separators such as commas inside the text
    # are left untouched.
    def expressionize(text)
      segments(text.normalize.tokenize).map { |value, _matched| value }
    end

    # Returns the registered expressions found in text, in order of
    # appearance. The text is scanned left to right; at each position the
    # longest registered expression wins and scanning resumes after it.
    def find(text)
      found = segments(text.normalize.tokenize).select { |_value, matched| matched }
      found.map { |value, _matched| value }
    end

    private

    # Cuts tokens into consecutive [value, matched] pairs: matched pairs hold
    # a space-joined expression, unmatched pairs hold a single token.
    # Iterative on purpose, so long texts cannot exhaust the call stack.
    def segments(tokens)
      result, index = [], 0
      while index < tokens.size
        length = longest_match(tokens, index)
        if length > 0
          result << [tokens[index, length].join(' '), true]
          index += length
        else
          result << [tokens[index], false]
          index += 1
        end
      end
      result
    end

    # Number of tokens of the longest registered expression starting at
    # tokens[start], or 0 when there is none. The last leaf seen while
    # walking down the trie is remembered, so a shorter expression is still
    # reported when a longer candidate is cut short by a mismatch or by the
    # end of the text.
    def longest_match(tokens, start)
      node, best = @root, 0
      (start...tokens.size).each do |i|
        node = node[tokens[i]]
        break unless node
        best = i - start + 1 if node.key?(:leaf)
      end
      best
    end

  end
end
@@ -1,9 +1,7 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  class TextNlp
4
- class Normalizer
5
- def initialize
6
- end
4
+ class Normalizer
7
5
  def normalize(text)
8
6
  text.downcase!
9
7
  text.tr!("éèàçîêô","eeacieo")
@@ -0,0 +1,123 @@
# encoding: UTF-8

class TextNlp
  # Boolean pattern over expressions, e.g. "((bd)||(bande dessinée))&&!samsung".
  #
  # Supported syntax: the binary operators || and &&, parentheses for
  # grouping, and a leading ! to negate a term or a parenthesized group.
  # Whitespace around operators, parentheses and terms is ignored.
  #
  # There is no operator precedence: the expression is split on its leftmost
  # operator found outside parentheses, so "a&&b||c" reads as a && (b || c).
  # Use parentheses to group explicitly. Parentheses are expected to be
  # balanced; this is not validated.
  class Pattern

    OPERATORS = ['||', '&&'].freeze

    # Root node of the evaluation tree (nil when built without argument).
    attr_reader :root

    # root_or_string - a pattern String to parse, or an already built node.
    def initialize(root_or_string = nil)
      @root = root_or_string.is_a?(String) ? parse(root_or_string) : root_or_string
    end

    # Appends a node to the root node (which must itself respond to <<).
    def <<(node)
      @root << node
    end

    # Returns true when text satisfies the pattern.
    def match?(text)
      @root.evaluate(text)
    end

    private

    # Recursively turns a pattern string into a tree of nodes.
    def parse(expr)
      expr = expr.strip
      position = top_level_operator_index(expr)
      if position
        node = operator_node(expr[position, 2])
        # Operands are taken from the untouched source text so that nested
        # parentheses and negations survive until they are parsed themselves.
        node << parse(expr[0, position])
        node << parse(expr[(position + 2)..-1])
        node
      elsif wrapped?(expr)
        parse(expr[1..-2])
      elsif expr[0, 1] == '!'
        operand = expr[1..-1].strip
        # "!(a||b)" negates the whole group; "!a" negates a single term.
        wrapped?(operand) ? Negation.new(parse(operand)) : Not.new(operand)
      else
        Unary.new(expr)
      end
    end

    # Character index of the first operator located outside any
    # parentheses, or nil when there is none.
    def top_level_operator_index(expr)
      depth = 0
      expr.chars.each_with_index do |char, i|
        case char
        when '(' then depth += 1
        when ')' then depth -= 1
        else
          return i if depth == 0 && OPERATORS.include?(expr[i, 2])
        end
      end
      nil
    end

    # True when expr is entirely enclosed in one matching pair of
    # parentheses, as in "(a||b)" but not "(a)||(b)".
    def wrapped?(expr)
      return false unless expr[0, 1] == '(' && expr[-1, 1] == ')'
      depth = 0
      expr.chars.each_with_index do |char, i|
        depth += 1 if char == '('
        depth -= 1 if char == ')'
        # The opening parenthesis closes here: wrapped only if this is the end.
        return i == expr.size - 1 if depth == 0
      end
      false
    end

    # Builds the composite node for an operator; nil for anything else.
    def operator_node(operator)
      case operator
      when '||' then Or.new
      when '&&' then And.new
      end
    end

    # Node holding an ordered list of child nodes.
    class Composite
      attr_reader :nodes

      def initialize(*nodes)
        @nodes = nodes
      end

      def <<(node)
        @nodes << node
      end

      # Flat list of the values of all child nodes.
      def values
        @nodes.map { |node| node.values }.flatten
      end
    end

    # True when every child node evaluates to true (true when empty).
    class And < Composite
      def evaluate(expr)
        @nodes.all? { |node| node.evaluate(expr) }
      end
    end

    # True when at least one child node evaluates to true (false when empty).
    class Or < Composite
      def evaluate(expr)
        @nodes.any? { |node| node.evaluate(expr) }
      end
    end

    # Leaf node: true when its expression occurs in the evaluated text.
    class Unary
      attr_reader :value

      def initialize(value)
        @value = value
        @expressions = Expressions.new([@value])
      end

      def evaluate(expr)
        @expressions.any?(expr)
      end

      def values
        [value]
      end
    end

    # Negated leaf: true when its expression does NOT occur in the text.
    # Contributes no values, since its expression is one to avoid.
    class Not < Unary
      def evaluate(expr)
        !super(expr)
      end

      def values
        []
      end
    end

    # Negation of a whole sub-tree, produced by "!( ... )".
    # Like Not, it contributes no values.
    class Negation
      attr_reader :node

      def initialize(node)
        @node = node
      end

      def evaluate(expr)
        !@node.evaluate(expr)
      end

      def values
        []
      end
    end

  end
end
@@ -0,0 +1,36 @@
# encoding: UTF-8

# NLP helpers added to String. Normalization and tokenization are delegated
# to pluggable objects (String.normalizer / String.tokenizer), falling back
# to TextNlp::Normalizer and TextNlp::Tokenizer when none is set.
class String

  # Truthy once the string has been produced by #normalize.
  attr_accessor :normalized

  class << self
    # Object responding to normalize(string); nil selects the default.
    attr_accessor :normalizer
    # Object responding to tokenize(string); nil selects the default.
    attr_accessor :tokenizer
  end

  # Returns the normalized version of the string, flagged as normalized so
  # that normalizing it again is a no-op returning the same object.
  def normalize
    return self if normalized
    result = (String.normalizer || TextNlp::Normalizer.new).normalize(self)
    # The flag is stored on the string itself, which a frozen string forbids.
    result = result.dup if result.frozen?
    result.normalized = true
    result
  end

  # Returns the tokens of the string, as produced by the tokenizer.
  def tokenize
    (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  # Similarity score between this string and text, from 0.0 (no token in
  # common, or one side has no token) to 1.0 (same set of tokens).
  #
  # The score is the average of the share of this string's distinct tokens
  # found in text and the share of text's distinct tokens found here. Tokens
  # are de-duplicated first: the intersection is a set, so the totals must be
  # sets too, otherwise a text with a repeated word would not even be fully
  # similar to itself.
  def similarity(text)
    own_tokens   = normalize.tokenize.uniq
    other_tokens = text.normalize.tokenize.uniq
    return 0.0 if own_tokens.empty? || other_tokens.empty?

    shared = (own_tokens & other_tokens).size.to_f
    ((shared / own_tokens.size) + (shared / other_tokens.size)) / 2
  end

end
@@ -0,0 +1,9 @@
# encoding: UTF-8

class TextNlp
  # Default tokenizer: cuts a text into whitespace-separated tokens.
  class Tokenizer
    # Returns the Array of tokens of text ([] for an empty or blank text).
    #
    # Splitting on a single space uses Ruby's whitespace mode, which ignores
    # leading and trailing whitespace; splitting on /\s+/ instead would
    # return an empty first token whenever the text starts with whitespace.
    def tokenize(text)
      text.split(' ')
    end
  end
end
data/lib/text_nlp.rb CHANGED
@@ -3,6 +3,10 @@
3
3
  $:.unshift(File.dirname(__FILE__) + '/../lib')
4
4
 
5
5
  require "text_nlp/normalizer.rb"
6
+ require "text_nlp/tokenizer.rb"
7
+ require "text_nlp/string.rb"
8
+ require "text_nlp/expressions.rb"
9
+ require "text_nlp/pattern.rb"
6
10
 
7
11
  class TextNlp
8
12
  end
@@ -0,0 +1,33 @@
# encoding: utf-8
require "spec_helper"

# Behaviour of the expression dictionary: lookup, text segmentation and
# presence test.
describe TextNlp::Expressions do

  it "should find the expressions" do
    registered = ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
    tree = TextNlp::Expressions.new(registered)
    found = tree.find('nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a')

    found.size.should eq 3
    ['nicolas sarkozy','carla bruni','carla bruni sarkozy a'].each do |expected|
      found.include?(expected).should be_true
    end

    # Every registered expression is exposed through #values.
    registered.size.should eq tree.values.size
    registered.each { |value| tree.values.include?(value).should be_true }
  end

  it "should expressionize the text" do
    registered = ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
    tree = TextNlp::Expressions.new(registered)
    segments = tree.expressionize('nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a')

    segments.should eq ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a']
    ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a'].each do |expected|
      segments.include?(expected).should be_true
    end
  end

  it "should returns true or false if any expression present in text" do
    tree = TextNlp::Expressions.new(['olympique de marseille','lyon'])

    tree.any?("l olympique de marseille").should be_true
    tree.any?("lyon c est plus ce que c etait").should be_true
    tree.any?("marseille pres du vieux port").should be_false
  end

end
@@ -0,0 +1,13 @@
# encoding: utf-8
require "spec_helper"

# Behaviour of boolean patterns combining ||, && and ! over expressions.
describe TextNlp::Pattern do

  it "should match or not the pattern" do
    # A fresh pattern is built for every text, from the same source string.
    build = lambda { TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung") }

    build.call.match?("cette bd est super").should be_true
    build.call.match?("cette bd est illisible sur samsung NTC").should be_false
    build.call.match?("cette bande dessinée est illisible sur samsung NTC").should be_false
    build.call.match?("cette bande dessinée est illisible").should be_true
  end

end
@@ -0,0 +1,36 @@
# encoding: utf-8
require "spec_helper"

# Behaviour of the NLP helpers added to String.
describe String do

  # Start every example from the default normalizer and tokenizer.
  before(:each) do
    String.normalizer = nil
    String.tokenizer = nil
  end

  it "should call normalizer" do
    text = "TOTO"
    fake_normalizer = double()
    String.normalizer = fake_normalizer
    fake_normalizer.stub(:normalize) { |txt| txt.downcase }

    fake_normalizer.should_receive(:normalize).with(text)
    text = text.normalize
    text.should eq "TOTO".downcase

    # An already normalized string must not go through the normalizer again.
    fake_normalizer.should_not_receive(:normalize).with(text)
    text.normalize.should eq "TOTO".downcase
  end

  it "should call tokenizer" do
    text = "TOTO"
    fake_tokenizer = double()
    String.tokenizer = fake_tokenizer

    fake_tokenizer.should_receive(:tokenize).with(text)
    text.tokenize
  end

  it "should compute similarity" do
    "il fait chaud".similarity("il fait chaud").should eq 1.0
    "il fait chaud".similarity("putin c nul ici").should eq 0.0
    "il fait chaud".similarity("youhou ca le fait").should be_within(0.01).of(0.29)
    "".similarity("il fait chaud").should eq 0.0
    "il fait chaud".similarity("").should eq 0.0
  end

end
@@ -0,0 +1,11 @@
# encoding: utf-8
require "spec_helper"

# Behaviour of the default whitespace tokenizer.
describe TextNlp::Tokenizer do

  it "should tokenize text" do
    tokenizer = TextNlp::Tokenizer.new
    tokens = tokenizer.tokenize("Comment q'ça se fait ? blabla?")

    # Punctuation is only split off when surrounded by whitespace.
    tokens.should eq ["Comment","q'ça","se","fait","?","blabla?"]
  end

end
data/text_nlp.gemspec CHANGED
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'text_nlp'
3
- s.version = '0.0.0'
3
+ s.version = '0.0.1'
4
4
  s.date = '2011-07-05'
5
5
  s.summary = "A minimalist NLP library"
6
6
  s.description = s.summary
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.0.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -26,10 +26,17 @@ files:
26
26
  - README
27
27
  - Rakefile
28
28
  - lib/text_nlp.rb
29
+ - lib/text_nlp/expressions.rb
29
30
  - lib/text_nlp/normalizer.rb
31
+ - lib/text_nlp/pattern.rb
32
+ - lib/text_nlp/string.rb
33
+ - lib/text_nlp/tokenizer.rb
34
+ - spec/expressions_spec.rb
30
35
  - spec/normalizer_spec.rb
36
+ - spec/pattern_spec.rb
31
37
  - spec/spec_helper.rb
32
- - text_nlp-0.0.0.gem
38
+ - spec/string_spec.rb
39
+ - spec/tokenizer_spec.rb
33
40
  - text_nlp.gemspec
34
41
  homepage: http://github.com/fonzo14/text_nlp
35
42
  licenses: []