text_nlp 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -1
- data/lib/text_nlp/expressions.rb +75 -0
- data/lib/text_nlp/normalizer.rb +1 -3
- data/lib/text_nlp/pattern.rb +123 -0
- data/lib/text_nlp/string.rb +36 -0
- data/lib/text_nlp/tokenizer.rb +9 -0
- data/lib/text_nlp.rb +4 -0
- data/spec/expressions_spec.rb +33 -0
- data/spec/pattern_spec.rb +13 -0
- data/spec/string_spec.rb +36 -0
- data/spec/tokenizer_spec.rb +11 -0
- data/text_nlp.gemspec +1 -1
- metadata +9 -2
data/.gitignore
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
class TextNlp
  # Dictionary of (possibly multi-word) expressions stored as a token trie.
  #
  # Every expression is normalized and tokenized through the String
  # extensions (String#normalize / String#tokenize) before being inserted.
  # Trie nodes are plain hashes keyed by token (String); the Symbol key
  # :leaf marks a node where a registered expression ends.
  class Expressions

    # Normalized form of every expression added so far, in insertion order.
    attr_accessor :values

    # @param expressions [Array<String>] expressions to register up front
    def initialize(expressions = [])
      @root, @values = {}, []
      expressions.each { |expr| self << expr }
    end

    # Registers one expression in the trie.
    #
    # @param expression [String]
    # @return [Array<String>] the tokens of the normalized expression
    def <<(expression)
      expression = expression.normalize
      @values << expression
      tokens = expression.tokenize
      last_node = tokens.inject(@root) { |node, token| node[token] ||= {} }
      # An expression without tokens must not turn the root into a leaf.
      last_node[:leaf] = 1 unless tokens.empty?
      tokens
    end

    # @param text [String]
    # @return [Boolean] true when at least one registered expression occurs in text
    def any?(text)
      !find(text).empty?
    end

    # Tokenizes text and merges every run of tokens that forms a found
    # expression into a single space-joined element. Longer expressions are
    # merged first so they win over the shorter expressions they contain.
    #
    # NOTE(review): expressions are looked up in the normalized text but
    # merged into the tokens of the raw text (as before); callers presumably
    # pass already-normalized text - confirm.
    #
    # @param text [String]
    # @return [Array<String>] tokens, with expressions collapsed
    def expressionize(text)
      found = find(text).uniq.sort_by { |expr| -expr.tokenize.size }
      found.inject(text.tokenize) do |tokens, expr|
        merge_expression(tokens, expr.tokenize, expr)
      end
    end

    # Scans the normalized text left to right and returns every registered
    # expression found, preferring the longest expression at each position.
    #
    # @param text [String]
    # @return [Array<String>] found expressions (tokens joined by a space), in text order
    def find(text)
      find_expressions(0, text.normalize.tokenize)
    end

    private

    # Iterative scan (no recursion, so long texts cannot exhaust the stack).
    # At each position the longest registered expression is taken and the
    # scan resumes right after it; otherwise it advances by one token.
    def find_expressions(start_index, tokens, expressions = [])
      index = start_index
      while index < tokens.size
        length = longest_match_length(tokens, index)
        if length > 0
          expressions << tokens[index, length].join(' ')
          index += length
        else
          index += 1
        end
      end
      expressions
    end

    # Number of tokens of the longest registered expression starting at
    # tokens[from], or 0 when none starts there. The last leaf seen is
    # remembered while descending, so a partial match that runs into a
    # mismatch OR into the end of the text still yields its longest
    # registered prefix.
    def longest_match_length(tokens, from)
      node, depth, best = @root, 0, 0
      while (from + depth) < tokens.size && node.key?(tokens[from + depth])
        node = node[tokens[from + depth]]
        depth += 1
        best = depth if node.key?(:leaf)
      end
      best
    end

    # Replaces each occurrence of the token sequence expr_tokens in tokens by
    # the single element merged. Whole tokens are compared, so an expression
    # can never match in the middle of a longer token.
    def merge_expression(tokens, expr_tokens, merged)
      return tokens if expr_tokens.empty?
      width, result, index = expr_tokens.size, [], 0
      while index < tokens.size
        if tokens[index, width] == expr_tokens
          result << merged
          index += width
        else
          result << tokens[index]
          index += 1
        end
      end
      result
    end

  end
end
|
data/lib/text_nlp/normalizer.rb
CHANGED
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
class TextNlp
  # Boolean pattern over expressions, e.g. "((bd)||(bande dessinée))&&!samsung".
  #
  # Grammar handled by #parse:
  # * "a||b" / "a&&b"  - binary operators, split at the FIRST operator found
  #   outside parentheses; the right-hand side is parsed recursively, so
  #   there is no operator precedence ("a&&b||c" means a && (b || c)).
  # * "(...)"          - grouping.
  # * "!x" / "!(...)"  - negation of an expression or of a whole group.
  # Whitespace around operands is ignored.
  class Pattern

    OPERATORS = ['||', '&&'].freeze

    # Root node of the parsed tree (responds to #evaluate and #values).
    attr_reader :root

    # @param root_or_string [String, #evaluate, nil] a pattern string to
    #   parse, or an already built node used as root as-is
    def initialize(root_or_string = nil)
      @root = root_or_string.is_a?(String) ? parse(root_or_string) : root_or_string
    end

    # Appends a child node to the root (the root must be a Composite).
    def <<(node)
      @root << node
    end

    # @param text [String]
    # @return [Boolean] result of evaluating the pattern tree against text
    def match?(text)
      @root.evaluate(text)
    end

    private

    # Recursively turns a pattern string into a node tree.
    def parse(expr)
      expr = expr.strip

      # "!( ... )": negate the whole group rather than only its first operand.
      if expr.start_with?('!')
        operand = expr[1..-1].strip
        return Not.new(parse(operand)) if enclosed?(operand)
      end

      # Single pass: look for the first operator outside parentheses while
      # collecting the expression with its outermost parentheses removed.
      unwrapped, opened, closed = [], 0, 0
      expr.each_char.with_index do |char, i|
        if char == '('
          opened += 1
          unwrapped << char if (opened - closed) > 1
        elsif char == ')'
          closed += 1
          unwrapped << char if (opened - closed) > 0
        elsif opened == closed && i > 0 && OPERATORS.include?(expr[i - 1, 2])
          node = operator_node(expr[i - 1, 2])
          # Operands are sliced from the raw text so that their own
          # parentheses (and a leading "!") are still visible to the
          # recursive calls.
          node << parse(expr[0...(i - 1)])
          node << parse(expr[(i + 1)..-1])
          return node
        else
          unwrapped << char
        end
      end

      # No top-level operator. If parentheses were removed, parse what was
      # inside them; the string gets strictly shorter, so this terminates.
      inner = unwrapped.join
      return parse(inner) unless inner == expr

      expr.start_with?('!') ? Not.new(expr[1..-1].strip) : Unary.new(expr)
    end

    # True when expr opens with a parenthesis whose matching closing
    # parenthesis is the very last character, i.e. "(a||b)" but not "(a)||(b)".
    def enclosed?(expr)
      return false unless expr.start_with?('(') && expr.end_with?(')')
      depth = 0
      expr.each_char.with_index do |char, i|
        depth += 1 if char == '('
        depth -= 1 if char == ')'
        return i == expr.size - 1 if depth.zero?
      end
      false
    end

    # @return [Or, And, nil] a fresh composite for the operator, nil if unknown
    def operator_node(operator)
      case operator
      when '||' then Or.new
      when '&&' then And.new
      end
    end

    # Node holding an ordered list of child nodes.
    class Composite
      attr_reader :nodes

      def initialize(*nodes)
        @nodes = nodes
      end

      def <<(node)
        @nodes << node
      end

      # Values of all children, flattened into one array.
      def values
        @nodes.map { |node| node.values }.flatten
      end
    end

    # True when every child evaluates to true (true when there is no child).
    class And < Composite
      def evaluate(expr)
        @nodes.all? { |node| node.evaluate(expr) }
      end
    end

    # True when at least one child evaluates to true.
    class Or < Composite
      def evaluate(expr)
        @nodes.any? { |node| node.evaluate(expr) }
      end
    end

    # Leaf node: true when its expression is found in the evaluated text.
    class Unary
      attr_reader :value

      def initialize(value)
        @value = value
        @expressions = Expressions.new([@value])
      end

      def evaluate(expr)
        @expressions.any?(expr)
      end

      def values
        [value]
      end
    end

    # Negation of a single expression (String) or of a whole sub-tree (any
    # node responding to #evaluate). Contributes nothing to #values.
    class Not < Unary
      def initialize(value)
        if value.is_a?(String)
          super
        else
          @value = value
          @operand = value
        end
      end

      def evaluate(expr)
        @operand ? !@operand.evaluate(expr) : !super(expr)
      end

      def values
        []
      end
    end

  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Text-processing helpers added to String. Normalization and tokenization are
# delegated to the pluggable String.normalizer / String.tokenizer objects and
# fall back to TextNlp::Normalizer / TextNlp::Tokenizer when none is set.
class String

  # Truthy once the string has been produced by #normalize.
  # NOTE(review): the flag is an instance variable, so it is not cleared when
  # the string is mutated afterwards - confirm callers never mutate
  # normalized strings.
  attr_accessor :normalized

  class << self
    # Object responding to #normalize(string); nil selects the default.
    attr_accessor :normalizer
    # Object responding to #tokenize(string); nil selects the default.
    attr_accessor :tokenizer
  end

  # @return [String] self when already flagged as normalized, otherwise the
  #   normalizer's result flagged as normalized
  def normalize
    return self if normalized
    result = (String.normalizer || TextNlp::Normalizer.new).normalize(self)
    # The flag is stored on the string itself; a frozen result cannot carry it.
    result = result.dup if result.frozen?
    result.normalized = true
    result
  end

  # @return whatever the tokenizer returns for this string
  def tokenize
    (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  # Token-overlap similarity: the mean of the shares of each text's distinct
  # tokens that also occur in the other text.
  #
  # @param text [String]
  # @return [Float] between 0.0 and 1.0; 0.0 when either text has no token
  def similarity(text)
    own_tokens = normalize.tokenize.uniq
    other_tokens = text.normalize.tokenize.uniq
    return 0.0 if own_tokens.empty? || other_tokens.empty?
    # Array#& removes duplicates, so both denominators must count distinct
    # tokens too; otherwise a text with repeated words is not fully similar
    # to itself.
    shared = (own_tokens & other_tokens).size.to_f
    ((shared / own_tokens.size) + (shared / other_tokens.size)) / 2
  end

end
|
data/lib/text_nlp.rb
CHANGED
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
# Specs for TextNlp::Expressions: lookup (find), token merging (expressionize)
# and presence test (any?).
# Assertions use `include` / `be true` / `be false` rather than
# be_true / be_false, which no longer exist in RSpec 3.
describe TextNlp::Expressions do

  it "should find the expressions" do
    expression_values = ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
    etree = TextNlp::Expressions.new(expression_values)
    expressions = etree.find('nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a')
    expressions.size.should eq 3
    ['nicolas sarkozy','carla bruni','carla bruni sarkozy a'].each { |e| expressions.should include(e) }
    # Every registered expression must be exposed through #values.
    etree.values.size.should eq expression_values.size
    expression_values.each do |v|
      etree.values.should include(v)
    end
  end

  it "should expressionize the text" do
    expression_values = ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
    etree = TextNlp::Expressions.new(expression_values)
    expressions = etree.expressionize('nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a')
    expressions.should eq ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a']
    ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a'].each { |e| expressions.should include(e) }
  end

  it "should returns true or false if any expression present in text" do
    etree = TextNlp::Expressions.new(['olympique de marseille','lyon'])
    etree.any?("l olympique de marseille").should be true
    etree.any?("lyon c est plus ce que c etait").should be true
    etree.any?("marseille pres du vieux port").should be false
  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
# Specs for TextNlp::Pattern: a pattern combining a group, an OR, an AND and
# a negation is matched against texts that should and should not match.
# Assertions use `be true` / `be false` rather than be_true / be_false, which
# no longer exist in RSpec 3.
describe TextNlp::Pattern do

  it "should match or not the pattern" do
    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est super").should be true
    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bd est illisible sur samsung NTC").should be false
    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible sur samsung NTC").should be false
    TextNlp::Pattern.new("((bd)||(bande dessinée))&&!samsung").match?("cette bande dessinée est illisible").should be true
  end

end
|
data/spec/string_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
# Specs for the String extensions (normalize, tokenize, similarity).
describe String do

  # String.normalizer / String.tokenizer are process-wide settings: reset them
  # before each example, and also afterwards so that a double installed here
  # can never leak into examples of other spec files.
  before(:each) { String.normalizer = nil; String.tokenizer = nil; }
  after(:each) { String.normalizer = nil; String.tokenizer = nil; }

  it "should call normalizer" do
    text = "TOTO"
    normalizer = double()
    String.normalizer = normalizer
    normalizer.stub(:normalize) { |txt| txt.downcase }
    normalizer.should_receive(:normalize).with(text)
    text = text.normalize
    text.should eq "TOTO".downcase
    # An already normalized string must not be sent to the normalizer again.
    normalizer.should_not_receive(:normalize).with(text)
    text.normalize.should eq "TOTO".downcase
  end

  it "should call tokenizer" do
    text = "TOTO"
    tokenizer = double()
    String.tokenizer = tokenizer
    tokenizer.should_receive(:tokenize).with(text)
    text.tokenize
  end

  it "should compute similarity" do
    "il fait chaud".similarity("il fait chaud").should eq 1.0
    "il fait chaud".similarity("putin c nul ici").should eq 0.0
    "il fait chaud".similarity("youhou ca le fait").should be_within(0.01).of(0.29)
    "".similarity("il fait chaud").should eq 0.0
    "il fait chaud".similarity("").should eq 0.0
  end

end
|
data/text_nlp.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -26,10 +26,17 @@ files:
|
|
26
26
|
- README
|
27
27
|
- Rakefile
|
28
28
|
- lib/text_nlp.rb
|
29
|
+
- lib/text_nlp/expressions.rb
|
29
30
|
- lib/text_nlp/normalizer.rb
|
31
|
+
- lib/text_nlp/pattern.rb
|
32
|
+
- lib/text_nlp/string.rb
|
33
|
+
- lib/text_nlp/tokenizer.rb
|
34
|
+
- spec/expressions_spec.rb
|
30
35
|
- spec/normalizer_spec.rb
|
36
|
+
- spec/pattern_spec.rb
|
31
37
|
- spec/spec_helper.rb
|
32
|
-
-
|
38
|
+
- spec/string_spec.rb
|
39
|
+
- spec/tokenizer_spec.rb
|
33
40
|
- text_nlp.gemspec
|
34
41
|
homepage: http://github.com/fonzo14/text_nlp
|
35
42
|
licenses: []
|