nlp 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/rid_category.rb
CHANGED
@@ -1,18 +1,17 @@
|
|
1
1
|
module NLP
|
2
|
-
|
3
|
-
|
2
|
+
# Category subclass whose predicates compare +root+ with fixed symbols.
# NOTE(review): +root+ is not defined here; presumably it is inherited from
# Category - confirm against category.rb.
class RIDCategory < Category
  # True when +root+ equals :PIERWOTNE.
  def primary?
    rooted_in?(:PIERWOTNE)
  end

  # True when +root+ equals :WTORNE.
  def secondary?
    rooted_in?(:WTORNE)
  end

  # True when +root+ equals :EMOCJE.
  def emotions?
    rooted_in?(:EMOCJE)
  end

  private

  # Shared comparison behind the three public predicates.
  def rooted_in?(expected_root)
    root == expected_root
  end
end
|
18
17
|
end
|
data/lib/sentence.rb
CHANGED
@@ -1,16 +1,24 @@
|
|
1
1
|
module NLP
|
2
|
-
|
3
|
-
attr_reader :tokens
|
4
|
-
def initialize()
|
5
|
-
@tokens = []
|
6
|
-
end
|
2
|
+
# Ordered list of tokens that were appended to one sentence.
class Sentence
  attr_reader :tokens

  # Starts with an empty token list.
  def initialize
    @tokens = []
  end

  # Appends a single token, or every element when given an Array.
  # Returns self so calls can be chained.
  def <<(tokens)
    case tokens
    when Array then @tokens.concat(tokens)
    else @tokens.push(tokens)
    end
    self
  end

  # Number of stored tokens whose +interp?+ is false.
  def words_number
    @tokens.reject { |token| token.interp? }.size
  end
end
|
16
24
|
end
|
data/lib/statistic.rb
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Collects words per category and reports what share of the added words
# falls into groups of categories.
class Statistic
  # NOTE(review): the +hash+ accessor shadows Object#hash, so instances cannot
  # safely be used as Hash keys. It is kept because callers may rely on it.
  attr_accessor :total_words, :hash
  attr_reader :cwords, :words, :word_count

  def initialize
    @word_count = 0
    @total_words = 0
    @scores = Hash.new(0) # category object => number of words added for it
    @words = []
    @cwords = {}          # category name => words added for that category
    @hash = {}            # backing store for #[] / #[]=; was left nil before
  end

  # Records +word+ under +category+.
  #
  # +category+ is used as a key of the score table and must respond to +name+.
  # Returns the list of words collected so far for that category name.
  def add(word, category)
    @scores[category] += 1
    @word_count += 1
    @words.push word
    (@cwords[category.name] ||= []).push word
  end

  # Stores an arbitrary value under +key+.
  def []=(key, value)
    @hash[key] = value
  end

  # Reads the value stored under +key+ (nil when absent).
  def [](key)
    @hash[key]
  end

  # For every name in +categories+ computes the fraction of added words whose
  # category answers true to the "<name>?" predicate.
  #
  # Returns a Hash mapping each given name to a Float.
  def category_participation(categories)
    pairs = @scores.to_a
    result = {}
    categories.each do |cat|
      predicate = "#{cat}?"
      result[cat] = percentage_distribution(pairs) { |c| c.send(predicate) }
    end
    result
  end

  private

  # Sums the scores of the [category, score] pairs whose category satisfies
  # the block and divides by the number of added words. Returns 0.0 instead
  # of NaN when nothing has been added yet.
  def percentage_distribution(scores)
    return 0.0 if @word_count.zero?

    matched = scores.inject(0) do |count, pair|
      yield(pair[0]) ? count + pair[1] : count
    end
    Float(matched) / @word_count
  end
end
|
54
|
+
|
55
|
+
|
data/lib/stdlib/ext/array.rb
CHANGED
data/lib/stree.rb
CHANGED
@@ -5,53 +5,53 @@ module NLP
|
|
5
5
|
SYMBOLS = %w{* - : - / ) (}
|
6
6
|
attr_accessor :value
|
7
7
|
attr_accessor :subtrees
|
8
|
-
|
8
|
+
|
9
9
|
# 0 -> *
|
10
10
|
# 1 -> -
|
11
11
|
# 2 -> a
|
12
12
|
# 33 -> ź
|
13
13
|
# Creates an empty node: no stored values and 34 unset child slots
# (the comments above map slot indexes 0..33 to characters).
def initialize
  @value = []
  @subtrees = Array.new(34)
end
|
17
|
-
|
18
|
-
def insert(
|
19
|
-
priv_insert(
|
17
|
+
|
18
|
+
# Stores +value+ under the key string +s+.
# The string is split with scan(/./), so newline characters are dropped
# before the character list is handed to priv_insert.
def insert(s, value)
  characters = s.scan(/./)
  priv_insert(characters, value)
end
|
21
|
-
|
22
|
-
def find(
|
23
|
-
priv_find(
|
21
|
+
|
22
|
+
# Looks up the key string +s+.
# The string is split with scan(/./), so newline characters are dropped
# before the character list is handed to priv_find.
def find(s)
  characters = s.scan(/./)
  priv_find(characters)
end
|
25
25
|
|
26
|
-
|
27
|
-
|
26
|
+
|
27
|
+
protected
|
28
28
|
# Maps a character to its position in ALPHABET.
#
# Returns -1 when the character is not in ALPHABET or its position is
# greater than 35. Raises ArgumentError when +chr+ is nil or false.
# NOTE(review): the node allocates 34 child slots while this bound is 35;
# ALPHABET is defined elsewhere, so confirm the two agree.
def key( chr )
  raise ArgumentError, "Argument chr is nil" unless chr

  position = ALPHABET.index(chr)
  if position.nil? || position > 35
    -1 # invalid character
  else
    position
  end
end
|
39
|
-
|
39
|
+
|
40
40
|
# Recursive insertion step.
#
# +s+ is the list of characters still to consume. When it is empty, +value+
# is appended to this node's values. Otherwise the child slot selected by
# key(first character) is created on demand and the remaining characters
# (+s.tail+) are inserted there.
# NOTE(review): Array#tail is not core Ruby; presumably it comes from
# stdlib/ext/array.rb - confirm.
def priv_insert( s, value )
  return @value.push(value) if s.empty?

  slot = key( s.first )
  @subtrees[slot] ||= SearchTree.new
  @subtrees[slot].priv_insert( s.tail, value )
end
|
54
|
-
|
54
|
+
|
55
55
|
def priv_find( search )
|
56
56
|
if @subtrees[0]
|
57
57
|
@subtrees[0].value
|
@@ -69,17 +69,17 @@ module NLP
|
|
69
69
|
end
|
70
70
|
end
|
71
71
|
|
72
|
-
public
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
end
|
84
|
-
end
|
72
|
+
public
|
73
|
+
# Visits this node and all of its descendants breadth-first, yielding each
# node's +value+ to the block.
#
# Fixes two defects: the misspelled @subrees (always nil, so Array#concat
# raised TypeError) and the queueing of unset (nil) child slots, which made
# +node.value+ fail on nil.
#
# Returns nil.
def traverse
  yield @value

  queue = (@subtrees || []).compact
  until queue.empty?
    node = queue.shift
    yield node.value
    children = node.subtrees
    queue.concat(children.compact) if children
  end
end
|
84
|
+
end
|
85
85
|
end
|
data/lib/takipi_web_service.rb
CHANGED
@@ -2,50 +2,50 @@ require 'rubygems'
|
|
2
2
|
require 'savon'
|
3
3
|
|
4
4
|
# Thin client for the TaKIPI SOAP service reachable at URL.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Characters that would break the XML request body, with their entities.
  XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }.freeze

  # Sends +text+ for tagging, polls until the job reports status 1 and
  # returns the result wrapped as "<xml><chunkList>...</chunkList></xml>".
  #
  # +text+ is XML-escaped before it is embedded in the request, so input
  # containing "&", "<" or ">" no longer produces a malformed body.
  #
  # Polling: status 2 or 3 means "ask again" (presumably queued / in
  # progress - confirm against the service documentation). Every poll sleeps
  # one second and adds +step+ to the counter, so polling stops after at most
  # 13 polls; the result is then requested regardless, as before.
  #
  # Raises RuntimeError when the service reports any other status; previously
  # such a status made the loop spin forever without sleeping.
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Submit the tagging job.
    submitted = client.tag do |soap|
      soap.body = "<text>#{escape_xml(text)}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end
    job = submitted.to_hash[:tag_response][:tag_response]
    token = job[:msg]
    status = job[:status].to_i

    # Wait for the job to finish.
    timeout = 60
    step = 5
    count = 0
    until status == 1 || count > timeout
      unless status == 2 || status == 3
        raise "Unexpected TaKIPI status #{status} for token #{token}"
      end

      count += step
      sleep(1)
      polled = client.get_status do |soap|
        soap.body = "<token>#{token}</token>"
      end
      status = polled.to_hash[:get_status_response][:status].to_i
    end

    # Fetch the result.
    result = client.get_result do |soap|
      soap.body = "<token>#{token}</token>"
    end
    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # Wrap the response so it is a well-formed XML string.
    "<xml><chunkList>#{response_document}</chunkList></xml>"
  end

  # Replaces "&", "<" and ">" in +text+ with their XML entities.
  def self.escape_xml(text)
    text.to_s.gsub(/[&<>]/) { |char| XML_ESCAPES[char] }
  end
  private_class_method :escape_xml
end
|
51
51
|
|
data/lib/text.rb
CHANGED
@@ -1,25 +1,26 @@
|
|
1
1
|
module NLP
|
2
|
-
|
3
|
-
|
2
|
+
# Ordered list of the sentences appended to a text.
class Text
  attr_reader :sentences

  # Starts with no sentences.
  def initialize
    @sentences = []
  end

  # Appends +sentence+; returns the internal sentence array (the result of
  # Array#push), not the Text itself.
  def <<(sentence)
    @sentences.push(sentence)
  end

  # Mean of +words_number+ over all sentences.
  # NOTE(review): Array#mean is not core Ruby; presumably it comes from
  # stdlib/ext/array.rb - confirm.
  def words_per_sentence
    counts = @sentences.collect { |sentence| sentence.words_number }
    counts.mean
  end

  # All tokens of all sentences, in order, as one new flat Array.
  def flatten
    @sentences.inject([]) do |all_tokens, sentence|
      sentence.tokens.each { |token| all_tokens.push(token) }
      all_tokens
    end
  end
end
|
25
26
|
end
|
data/lib/token.rb
CHANGED
@@ -1,34 +1,37 @@
|
|
1
|
-
|
2
1
|
module NLP
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
2
|
+
# A single token: its surface form (+orth+) and its tag data (+tags+).
# The predicates below inspect +tags+ only.
class Token
  attr_reader :orth, :tags

  def initialize(orth, tags)
    @orth = orth
    @tags = tags
  end

  # True when the tags are exactly "tsym".
  def symbol?
    @tags.eql?("tsym")
  end

  # True when the tags are exactly "interp".
  def interp?
    @tags.eql?("interp")
  end

  # True when the token is neither interp nor a number.
  def word?
    !(interp? || number?)
  end

  # True when the tags include "tnum".
  def number?
    @tags.include?("tnum")
  end

  # True when the tags include "tnum:integer".
  def integer?
    @tags.include?("tnum:integer")
  end

  # True when the tags include "tnum:frac".
  def float?
    @tags.include?("tnum:frac")
  end
end
|
34
37
|
end
|
data/lib/token_scanner.rb
CHANGED
@@ -1,72 +1,60 @@
|
|
1
|
-
|
2
1
|
module NLP
|
3
|
-
|
4
|
-
|
5
|
-
attr_reader :text, :tokens
|
2
|
+
# Cursor over the flat token list of a text.
#
# +text+ must respond to +flatten+ and return the tokens in order. The cursor
# starts on the first token; #next moves it forward to the next token of the
# requested kind.
class TokenScanner
  attr_reader :text, :tokens

  def initialize(text)
    @text = text
    @pos = 0
    @tokens = @text.flatten
  end

  # Advances by one token, then skips forward until the token under the
  # cursor matches +type+ or the end is reached.
  #
  # +type+ is :word, :interp, :number or :alphanum (number or word); any
  # other value advances by exactly one token.
  #
  # The cursor never moves past the end. Previously each call at the end
  # pushed the position further, after which #end? reported false.
  #
  # Returns nil.
  def next(type)
    @pos += 1 unless end?
    @pos += 1 until end? || stop_at?(@tokens[@pos], type)
    nil
  end

  # Token under the cursor, or nil at the end.
  def current
    end? ? nil : @tokens[@pos]
  end

  # Moves the cursor back to the first token.
  def rewind
    @pos = 0
  end

  # Current cursor position (0-based).
  def index
    @pos
  end

  # True when the cursor is past the last token.
  def end?
    @pos >= @tokens.size
  end

  private

  # Whether scanning for +type+ should stop on +token+.
  def stop_at?(token, type)
    case type
    when :word     then token.word?
    when :interp   then token.interp?
    when :number   then token.number?
    when :alphanum then token.number? || token.word?
    else true
    end
  end
end
|
72
60
|
end
|