text_nlp 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -1
- data/lib/text_nlp/expressions.rb +75 -0
- data/lib/text_nlp/normalizer.rb +1 -3
- data/lib/text_nlp/pattern.rb +123 -0
- data/lib/text_nlp/string.rb +36 -0
- data/lib/text_nlp/tokenizer.rb +9 -0
- data/lib/text_nlp.rb +4 -0
- data/spec/expressions_spec.rb +33 -0
- data/spec/pattern_spec.rb +13 -0
- data/spec/string_spec.rb +36 -0
- data/spec/tokenizer_spec.rb +11 -0
- data/text_nlp.gemspec +1 -1
- metadata +9 -2
data/.gitignore
CHANGED
data/lib/text_nlp/expressions.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
class TextNlp
  # A dictionary of (possibly multi-word) expressions stored as a token trie.
  #
  # Each expression is normalized and tokenized when added; every token is
  # one level of nested hashes under @root and the node of the last token
  # carries a :leaf marker. Texts are scanned left to right, always
  # preferring the longest known expression starting at the current token.
  class Expressions

    # Normalized form of every expression added so far (insertion order).
    attr_accessor :values

    # expressions - Enumerable of Strings to register immediately.
    def initialize(expressions = [])
      @root, @values = {}, []
      expressions.each { |expr| self << expr }
    end

    # Registers one expression.
    #
    # expression - String; it is normalized before being stored/tokenized.
    #
    # Returns self so that calls can be chained.
    def <<(expression)
      expression = expression.normalize
      @values << expression
      node = @root
      expression.tokenize.each { |token| node = (node[token] ||= {}) }
      # An expression without tokens must not turn the root into a leaf.
      node[:leaf] = 1 unless node.equal?(@root)
      self
    end

    # Returns true when at least one known expression occurs in text.
    def any?(text)
      !find(text).empty?
    end

    # Splits text into tokens, merging every run of tokens that forms a known
    # expression into a single space-joined element.
    #
    # Matching is done on whole tokens of text.tokenize (the text is not
    # normalized here), so an expression can never be glued onto a neighbouring
    # token that merely ends or starts with the same characters.
    #
    # Returns an Array of Strings.
    def expressionize(text)
      pieces = []
      each_segment(text.tokenize.to_a, 0) { |segment, _known| pieces << segment.join(' ') }
      pieces
    end

    # Returns the known expressions found in the normalized text, in order of
    # appearance (an expression occurring twice is listed twice).
    def find(text)
      find_expressions(0, text.normalize.tokenize.to_a)
    end

    private

    # Collects into expressions every known expression found in tokens from
    # start_index onwards and returns that array.
    #
    # Iterative on purpose: one recursive call per token overflows the stack
    # on long texts.
    def find_expressions(start_index, tokens, expressions = [])
      each_segment(tokens, start_index) do |segment, known|
        expressions << segment.join(' ') if known
      end
      expressions
    end

    # Walks tokens from start_index and yields consecutive segments:
    # [tokens_of_an_expression, true] for a match, [[single_token], false]
    # otherwise.
    def each_segment(tokens, start_index)
      index = start_index
      while index < tokens.size
        length = longest_match(tokens, index)
        known = length > 0
        length = 1 unless known
        yield tokens[index, length], known
        index += length
      end
    end

    # Number of tokens of the longest known expression starting at tokens[from],
    # or 0 when none starts there. Remembering the last leaf seen gives the
    # same result whether the walk stops on an unknown token or on the end of
    # the text.
    def longest_match(tokens, from)
      node, best = @root, 0
      from.upto(tokens.size - 1) do |i|
        break unless node.key?(tokens[i])
        node = node[tokens[i]]
        best = i - from + 1 if node.key?(:leaf)
      end
      best
    end

  end
end
|
data/lib/text_nlp/normalizer.rb
CHANGED
data/lib/text_nlp/pattern.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
class TextNlp
  # Boolean pattern over expressions, e.g. "((bd)||(bande dessinée))&&!samsung".
  #
  # Grammar handled by the parser:
  #   term        a bare expression                  -> Unary
  #   !term       negated expression                 -> Not
  #   !( ... )    negated group                      -> Negation
  #   a||b, a&&b  binary operators, split at the first operator found outside
  #               parentheses (so "a&&b||c" reads as a && (b || c))
  #   ( ... )     grouping
  class Pattern

    OPERATOR_TOKENS = ['||', '&&'].freeze

    # Root node of the parsed tree (responds to #evaluate and #values).
    attr_reader :root

    # root_or_string - a pattern String to parse, or an already built node.
    def initialize(root_or_string = nil)
      @root = root_or_string.is_a?(String) ? parse(root_or_string) : root_or_string
    end

    # Appends a child node to the root (the root must be a Composite).
    def <<(node)
      @root << node
    end

    # Evaluates the pattern against text; returns true or false.
    def match?(text)
      @root.evaluate(text)
    end

    private

    # Builds the node tree for expr.
    #
    # Operands are parsed from the raw substrings on each side of the operator
    # so that a leading "!" stays attached to the group it negates.
    def parse(expr)
      split = top_level_operator(expr)
      if split
        node = operator_node(expr[split, 2])
        node << parse(expr[0...split])
        node << parse(expr[(split + 2)..-1])
        return node
      end

      # "!( ... )": negate the whole group, not just its first term.
      if expr[0..0] == '!' && enclosed_group?(expr[1..-1])
        return Negation.new(parse(expr[2..-2]))
      end

      # Drop one level of grouping parentheses and try again; once nothing is
      # left to strip the text is a plain (possibly negated) term.
      stripped = strip_outer_parentheses(expr)
      return parse(stripped) unless stripped == expr
      stripped[0..0] == '!' ? Not.new(stripped[1..-1]) : Unary.new(stripped)
    end

    # Index of the first character of the first operator located outside any
    # parentheses, or nil when there is none.
    def top_level_operator(expr)
      depth = 0
      expr.chars.each_with_index do |char, i|
        if char == '('
          depth += 1
        elsif char == ')'
          depth -= 1
        elsif depth.zero? && i > 0 && OPERATOR_TOKENS.include?(expr[i - 1, 2])
          return i - 1
        end
      end
      nil
    end

    # True when text is a single "( ... )" group, i.e. the parenthesis opened
    # by its first character is closed by its last one.
    def enclosed_group?(text)
      return false unless text.size >= 2 && text[0..0] == '(' && text[-1..-1] == ')'
      depth = 0
      text.chars.each_with_index do |char, i|
        depth += 1 if char == '('
        depth -= 1 if char == ')'
        return i == text.size - 1 if depth.zero?
      end
      false
    end

    # Returns expr without its outermost level of parentheses; nested
    # parentheses are kept. Unbalanced input is tolerated.
    def strip_outer_parentheses(expr)
      depth = 0
      kept = []
      expr.each_char do |char|
        if char == '('
          depth += 1
          kept << char if depth > 1
        elsif char == ')'
          depth -= 1
          kept << char if depth > 0
        else
          kept << char
        end
      end
      kept.join
    end

    # Maps an operator token to a fresh composite node (nil if unknown).
    def operator_node(operator)
      case operator
      when '||' then Or.new
      when '&&' then And.new
      end
    end

    # Node holding an ordered list of child nodes.
    class Composite
      attr_reader :nodes

      def initialize(*nodes)
        @nodes = nodes
      end

      def <<(node)
        @nodes << node
      end

      # Flat list of the values of all children.
      def values
        @nodes.map { |node| node.values }.flatten
      end
    end

    # True when every child evaluates to true (short-circuits).
    class And < Composite
      def evaluate(expr)
        @nodes.each do |node|
          return false unless node.evaluate(expr)
        end
        true
      end
    end

    # True when at least one child evaluates to true (short-circuits).
    class Or < Composite
      def evaluate(expr)
        @nodes.each do |node|
          return true if node.evaluate(expr)
        end
        false
      end
    end

    # Leaf: true when its expression occurs in the evaluated text.
    class Unary
      attr_reader :value

      def initialize(value)
        @value = value
        @expressions = Expressions.new([@value])
      end

      def evaluate(expr)
        @expressions.any?(expr)
      end

      def values
        [value]
      end
    end

    # Negated leaf: true when its expression does NOT occur in the text.
    # Contributes no values.
    class Not < Unary
      def evaluate(expr)
        !super(expr)
      end

      def values
        []
      end
    end

    # Negation of an arbitrary sub-tree, produced for "!( ... )".
    # Like Not, it contributes no values.
    class Negation
      attr_reader :node

      def initialize(node)
        @node = node
      end

      def evaluate(expr)
        !@node.evaluate(expr)
      end

      def values
        []
      end
    end

  end
end
|
data/lib/text_nlp/string.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Text-processing helpers added to String by text_nlp.
class String

  # Set to true on strings returned by #normalize so they are not normalized
  # a second time.
  attr_accessor :normalized

  class << self
    # Object responding to #normalize(string); when nil a new
    # TextNlp::Normalizer is used for each call.
    attr_accessor :normalizer
    # Object responding to #tokenize(string); when nil a new
    # TextNlp::Tokenizer is used for each call.
    attr_accessor :tokenizer
  end

  # Returns the normalized version of the string, flagged as normalized.
  # A string that already carries the flag is returned as is.
  def normalize
    return self if normalized

    result = (String.normalizer || TextNlp::Normalizer.new).normalize(self)
    # The flag is stored on the string itself, which a frozen result would
    # reject; work on a copy in that case.
    result = result.dup if result.frozen?
    result.normalized = true
    result
  end

  # Returns whatever the configured tokenizer produces for this string.
  def tokenize
    (String.tokenizer || TextNlp::Tokenizer.new).tokenize(self)
  end

  # Token-overlap similarity with text, between 0.0 and 1.0: the mean of the
  # share of this string's distinct tokens found in text and the share of
  # text's distinct tokens found in this string.
  #
  # Tokens are de-duplicated on both sides so that a repeated word does not
  # lower the score (a text compared with itself always scores 1.0).
  # Returns 0.0 when either side has no token.
  def similarity(text)
    own = normalize.tokenize.uniq
    other = text.normalize.tokenize.uniq
    return 0.0 if own.empty? || other.empty?

    shared = (own & other).size.to_f
    ((shared / own.size) + (shared / other.size)) / 2
  end

end
|
data/lib/text_nlp.rb
CHANGED
data/spec/expressions_spec.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe TextNlp::Expressions do

  # Dictionary shared by the first two examples: two pairs of expressions in
  # which the shorter one is a prefix of the longer one.
  let(:expression_values) do
    ['nicolas sarkozy','nicolas sarkozy 1er de france','carla bruni','carla bruni sarkozy a']
  end
  let(:etree) { TextNlp::Expressions.new(expression_values) }
  let(:sentence) do
    'nicolas sarkozy 1er de italie est marie a carla bruni qui de fait est devenue carla bruni sarkozy a'
  end

  it "should find the expressions" do
    found = etree.find(sentence)
    found.size.should eq 3
    ['nicolas sarkozy','carla bruni','carla bruni sarkozy a'].each do |expected|
      found.include?(expected).should be_true
    end

    # Every registered expression must be exposed through #values.
    expression_values.size.should eq etree.values.size
    expression_values.each do |value|
      etree.values.include?(value).should be_true
    end
  end

  it "should expressionize the text" do
    expected = ['nicolas sarkozy','1er','de','italie','est','marie','a','carla bruni','qui','de','fait','est','devenue','carla bruni sarkozy a']
    pieces = etree.expressionize(sentence)
    pieces.should eq expected
    expected.each { |piece| pieces.include?(piece).should be_true }
  end

  it "should returns true or false if any expression present in text" do
    clubs = TextNlp::Expressions.new(['olympique de marseille','lyon'])
    clubs.any?("l olympique de marseille").should be_true
    clubs.any?("lyon c est plus ce que c etait").should be_true
    clubs.any?("marseille pres du vieux port").should be_false
  end

end
|
data/spec/pattern_spec.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe TextNlp::Pattern do

  it "should match or not the pattern" do
    # "bd" or "bande dessinée" must be present, "samsung" must be absent.
    source = "((bd)||(bande dessinée))&&!samsung"
    outcomes = [
      ["cette bd est super", true],
      ["cette bd est illisible sur samsung NTC", false],
      ["cette bande dessinée est illisible sur samsung NTC", false],
      ["cette bande dessinée est illisible", true]
    ]

    outcomes.each do |text, expected|
      # A fresh pattern is parsed for every text.
      matched = TextNlp::Pattern.new(source).match?(text)
      if expected
        matched.should be_true
      else
        matched.should be_false
      end
    end
  end

end
|
data/spec/string_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
|
4
|
+
describe String do

  # Start every example from the default normalizer / tokenizer.
  before(:each) do
    String.normalizer = nil
    String.tokenizer = nil
  end

  it "should call normalizer" do
    text = "TOTO"
    fake_normalizer = double()
    String.normalizer = fake_normalizer
    fake_normalizer.stub(:normalize) { |txt| txt.downcase }

    # First call goes through the configured normalizer...
    fake_normalizer.should_receive(:normalize).with(text)
    text = text.normalize
    text.should eq "TOTO".downcase

    # ...a string already flagged as normalized must not be sent again.
    fake_normalizer.should_not_receive(:normalize).with(text)
    text.normalize.should eq "TOTO".downcase
  end

  it "should call tokenizer" do
    text = "TOTO"
    fake_tokenizer = double()
    String.tokenizer = fake_tokenizer
    fake_tokenizer.should_receive(:tokenize).with(text)
    text.tokenize
  end

  it "should compute similarity" do
    reference = "il fait chaud"
    reference.similarity("il fait chaud").should eq 1.0
    reference.similarity("putin c nul ici").should eq 0.0
    reference.similarity("youhou ca le fait").should be_within(0.01).of(0.29)
    # An empty side always yields 0.0.
    "".similarity("il fait chaud").should eq 0.0
    reference.similarity("").should eq 0.0
  end

end
|
data/text_nlp.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -26,10 +26,17 @@ files:
|
|
26
26
|
- README
|
27
27
|
- Rakefile
|
28
28
|
- lib/text_nlp.rb
|
29
|
+
- lib/text_nlp/expressions.rb
|
29
30
|
- lib/text_nlp/normalizer.rb
|
31
|
+
- lib/text_nlp/pattern.rb
|
32
|
+
- lib/text_nlp/string.rb
|
33
|
+
- lib/text_nlp/tokenizer.rb
|
34
|
+
- spec/expressions_spec.rb
|
30
35
|
- spec/normalizer_spec.rb
|
36
|
+
- spec/pattern_spec.rb
|
31
37
|
- spec/spec_helper.rb
|
32
|
-
-
|
38
|
+
- spec/string_spec.rb
|
39
|
+
- spec/tokenizer_spec.rb
|
33
40
|
- text_nlp.gemspec
|
34
41
|
homepage: http://github.com/fonzo14/text_nlp
|
35
42
|
licenses: []
|