nlp 0.2.6 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/lib/rid_analyzer.rb
DELETED
data/lib/rid_category.rb
DELETED
data/lib/sentence.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
module NLP
  # An ordered list of tokens that together form one sentence.
  class Sentence
    # The tokens collected so far, in insertion order.
    attr_reader :tokens

    # Starts with an empty token list.
    def initialize
      @tokens = []
    end

    # Appends to the sentence. An Array argument contributes each of its
    # elements; anything else is appended as a single token.
    # Returns self so that calls can be chained.
    def <<(tokens)
      case tokens
      when Array then @tokens.concat(tokens)
      else @tokens.push(tokens)
      end
      self
    end

    # Counts the tokens whose interp? predicate is falsy.
    def words_number
      @tokens.reject { |token| token.interp? }.size
    end
  end
end
|
data/lib/statistic.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
# Accumulates the outcome of a dictionary analysis: how many words were
# matched, which words fell into which category, and an auxiliary key/value
# store reachable through [] and []=.
class Statistic
  # NOTE(review): the `hash` accessor shadows Object#hash, so instances cannot
  # safely be used as Hash keys. It is kept because it is part of the public
  # interface.
  attr_accessor :total_words, :hash
  attr_reader :cwords, :words, :word_count

  def initialize
    @word_count = 0
    @total_words = 0
    @scores = Hash.new(0)
    @words = []
    @cwords = {}
    # Must be a real Hash: it used to be left nil, which made [] and []=
    # raise NoMethodError until a caller assigned `hash` manually.
    @hash = {}
  end

  # Records one matched word.
  #
  # word     - the matched word; stored as given.
  # category - the category object the word matched; it must respond to
  #            `name`, which is used as the key in `cwords`.
  #
  # Returns the list of words recorded so far under that category name.
  def add(word, category)
    @scores[category] += 1
    @word_count += 1
    @words.push(word)
    (@cwords[category.name] ||= []).push(word)
  end

  # Stores value under key in the auxiliary store.
  def []=(key, value)
    @hash[key] = value
  end

  # Reads key from the auxiliary store (nil when absent).
  def [](key)
    @hash[key]
  end

  # For every name in categories, computes the fraction of matched words
  # whose category answers truthy to the predicate "<name>?".
  #
  # Returns a Hash mapping each given name to a Float. Every fraction is 0.0
  # when no word has been added yet.
  def category_participation(categories)
    categories.each_with_object({}) do |cat, result|
      predicate = "#{cat}?"
      result[cat] = percentage_distribution(@scores) { |c| c.send(predicate) }
    end
  end

  private

  # Sums the scores of the categories accepted by the block and divides the
  # sum by the number of matched words.
  def percentage_distribution(scores)
    # Guard the empty case: Float / 0 would silently yield NaN.
    return 0.0 if @word_count.zero?

    matched = scores.inject(0) do |sum, (category, score)|
      yield(category) ? sum + score : sum
    end
    Float(matched) / @word_count
  end
end
|
54
|
-
|
55
|
-
|
data/lib/stdlib/ext/string.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# Character-wise helpers added to the core String class.
class String
  # Keeps the built-in [] reachable under another name. The misspelt alias
  # name is preserved because external code may already call it.
  alias old_memeber []

  # Plain String#[] lookup, bypassing the character-wise helpers below.
  def ordinary(index)
    old_memeber(index)
  end

  # Returns the character at index (nil when out of range).
  #
  # The multiline flag makes the dot match "\n" as well; without it newline
  # characters were skipped and every later index was shifted.
  def get(index)
    scan(/./m)[index]
  end

  # Replaces the character at index with value, modifying the receiver in
  # place, and returns value.
  #
  # The multiline flag keeps newlines in the rebuilt string; previously every
  # "\n" was silently dropped by the rewrite.
  def set(index, value)
    characters = scan(/./m)
    characters[index] = value
    replace(characters.join)
    value
  end
end
|
19
|
-
|
data/lib/stree.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
|
2
|
-
module NLP
  # A character trie over a fixed alphabet. Each node owns one child slot per
  # ALPHABET entry plus a list of values stored at that node. A '*' child acts
  # as a wildcard: as soon as a lookup reaches a node that has one, the
  # wildcard's values are returned regardless of the remaining characters.
  class SearchTree
    # Slot layout of every node: 0 -> '*', 1 -> '-', 2 -> 'a', ... 33 -> 'ż'.
    ALPHABET = %w{* - a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u w y z ź ż}
    SYMBOLS = %w{* - : - / ) (}

    attr_accessor :value
    attr_accessor :subtrees

    def initialize
      @subtrees = Array.new(34, nil)
      @value = []
    end

    # Stores value under the key string s, creating nodes as needed.
    def insert(s, value)
      priv_insert(s.scan(/./), value)
    end

    # Looks up s. Returns the Array of values stored at the matching node
    # (possibly empty), the values of the first wildcard met on the way, or
    # nil when the path does not exist.
    def find(s)
      priv_find(s.scan(/./))
    end

    # Yields the value list of every node, breadth first, starting with the
    # receiver.
    #
    # Empty child slots are skipped. Previously the method read a misspelt,
    # always-nil instance variable and queued nil children, so it raised on
    # every call.
    def traverse
      yield @value
      pending = Array(@subtrees).compact
      until pending.empty?
        node = pending.shift
        yield node.value
        pending.concat(Array(node.subtrees).compact)
      end
    end

    protected

    # Maps a character to its slot index.
    #
    # Raises ArgumentError when chr is nil.
    #
    # NOTE(review): characters outside ALPHABET map to -1, which Array
    # indexing resolves to the LAST slot (the one used by 'ż'), so such
    # characters share a subtree with 'ż'. Left unchanged because
    # already-built trees rely on it.
    def key(chr)
      raise ArgumentError, "Argument chr is nil" unless chr

      ALPHABET.index(chr) || -1
    end

    # Recursive insert. s is an Array of the characters still to consume.
    def priv_insert(s, value)
      return @value.push(value) if s.empty?

      slot = key(s.first)
      child = (@subtrees[slot] ||= SearchTree.new)
      child.priv_insert(s.drop(1), value)
    end

    # Recursive lookup. search is an Array of the characters still to consume.
    def priv_find(search)
      wildcard = @subtrees[0]
      return wildcard.value if wildcard
      return value if search.empty?

      child = @subtrees[key(search.first)]
      child && child.priv_find(search.drop(1))
    end
  end
end
|
data/lib/takipi_web_service.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'savon'
|
3
|
-
|
4
|
-
# Thin client for the remote TaKIPI SOAP tagging service.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Submits text for tagging, polls until the job leaves the pending states
  # (or the polling budget is used up) and returns the result wrapped in an
  # <xml><chunkList> envelope so that it forms a single-rooted document.
  #
  # NOTE(review): text is interpolated into the SOAP body without escaping;
  # confirm whether callers pre-escape XML metacharacters.
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Submit the job.
    response = client.tag do |soap|
      soap.body = "<text>#{text}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end.to_hash

    submitted = response[:tag_response][:tag_response]
    token = submitted[:msg]
    status = submitted[:status].to_i

    # Poll while the job reports status 2 or 3 (presumably queued / in
    # progress; status 1 presumably means done. Confirm against the service
    # documentation). Each 1-second poll consumes `step` units of the
    # `timeout` budget, exactly as before.
    #
    # Any other status now ends the polling. Previously an unexpected status
    # made the loop spin forever without sleeping or re-querying.
    timeout = 60
    step = 5
    count = 0
    while count <= timeout && (status == 2 || status == 3)
      count += step
      sleep(1)
      poll = client.get_status do |soap|
        soap.body = "<token>#{token}</token>"
      end.to_hash
      status = poll[:get_status_response][:status].to_i
    end

    # Fetch the result.
    result = client.get_result do |soap|
      soap.body = "<token>#{token}</token>"
    end
    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # Transform the response into a well-formed XML string.
    "<xml><chunkList>#{response_document}</chunkList></xml>"
  end
end
|
51
|
-
|
data/lib/text.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module NLP
  # A text represented as an ordered list of sentences.
  class Text
    # The sentences appended so far, in insertion order.
    attr_reader :sentences

    def initialize
      @sentences = []
    end

    # Appends one sentence and returns the internal sentence list.
    def <<(sentence)
      @sentences << sentence
    end

    # Mean of the per-sentence words_number values. Relies on Array#mean,
    # which is not defined in this class.
    def words_per_sentence
      counts = @sentences.map { |sentence| sentence.words_number }
      counts.mean
    end

    # Returns a new Array holding the tokens of every sentence, in order.
    def flatten
      @sentences.each_with_object([]) do |sentence, all_tokens|
        sentence.tokens.each { |token| all_tokens << token }
      end
    end
  end
end
|
data/lib/token.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
module NLP
  # One token: its orthographic form plus the tag value attached to it.
  # The predicates below classify the token by inspecting that tag value.
  class Token
    attr_reader :orth, :tags

    def initialize(orth, tags)
      @orth, @tags = orth, tags
    end

    # True when the tags are exactly "tsym".
    def symbol?
      @tags.eql?("tsym")
    end

    # True when the tags are exactly "interp".
    def interp?
      @tags.eql?("interp")
    end

    # True when the token is neither interp? nor number?.
    def word?
      !(interp? || number?)
    end

    # True when the tags include "tnum".
    def number?
      @tags.include?("tnum")
    end

    # True when the tags include "tnum:integer".
    def integer?
      @tags.include?("tnum:integer")
    end

    # True when the tags include "tnum:frac".
    def float?
      @tags.include?("tnum:frac")
    end
  end
end
|
data/lib/token_scanner.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
module NLP
  # A forward cursor over the flattened token list of a text.
  class TokenScanner
    attr_reader :text, :tokens

    # text must respond to `flatten` and return the list of tokens to scan.
    # The cursor starts on the first token.
    def initialize(text)
      @text = text
      @pos = 0
      @tokens = @text.flatten
    end

    # Moves the cursor forward by at least one token, then keeps moving until
    # it rests on a token of the requested type or reaches the end.
    #
    # type - :word, :interp, :number or :alphanum (number or word). Any other
    #        value advances by exactly one token.
    #
    # The cursor never moves beyond tokens.size. Previously a call made at
    # the end pushed it one further, after which end? answered false again.
    #
    # Returns nil.
    def next(type)
      @pos += 1 unless end?

      case type
      when :word     then skip_until { |token| token.word? }
      when :interp   then skip_until { |token| token.interp? }
      when :number   then skip_until { |token| token.number? }
      when :alphanum then skip_until { |token| token.number? || token.word? }
      end
      nil
    end

    # The token under the cursor, or nil once the end has been reached.
    def current
      end? ? nil : @tokens[@pos]
    end

    # Puts the cursor back on the first token.
    def rewind
      @pos = 0
    end

    # Zero-based cursor position.
    def index
      @pos
    end

    # True once the cursor has moved past the last token.
    def end?
      @pos >= @tokens.size
    end

    private

    # Advances the cursor until the block accepts the current token or the
    # end of the token list is reached.
    def skip_until
      @pos += 1 while @pos < @tokens.size && !yield(@tokens[@pos])
    end
  end
end
|
data/lib/word.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'inflectable'
|
2
|
-
require 'meaningable'
|
3
|
-
|
4
|
-
module NLP
  # A Token that additionally carries a lemma and an assignable category,
  # and mixes in the Inflectable and Meaningable modules.
  class Word < Token
    include Inflectable
    include Meaningable

    # Category assigned to this word; readable and writable.
    attr_accessor :category
    # The lemma passed to the constructor.
    attr_reader :lemat

    # word  - orthographic form, forwarded to Token together with tags.
    # lemat - the lemma of the word.
    # tags  - tag value, forwarded to Token.
    def initialize(word, lemat, tags)
      super(word, tags)
      @lemat = lemat
    end

    # The inflection of a word is its tag value.
    def inflection
      @tags
    end
  end
end
|
data/test/analyzer_test.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
require '../lib/analyzer.rb'
|
2
|
-
|
3
|
-
|
4
|
-
# Exercises Analyzer against a short lemmatized Polish sentence.
class AnalyzerTest < Test::Unit::TestCase
  include NLP

  # Lemmatizes the sample sentence and prepares one analyzer per dictionary.
  def setup
    sentence = "Ja byłam wtedy bardzo szczęśliwa"
    @text = Lemmatizer.lemmatize(sentence, :takipi, :local)
    @scanner = TokenScanner.new(@text)
    @rid_analyzer = Analyzer.new(:rid)
    @liwc_analyzer = Analyzer.new(:liwc)
  end

  # The RID analysis must return a Statistic covering all five words of the
  # sample, exactly one of which matched a category.
  def test_analyze
    statistic = @rid_analyzer.analyze(@scanner)

    assert_kind_of(Statistic, statistic)
    assert_equal(5, statistic.total_words)
    assert_equal(1, statistic.word_count)
  end
end
|
25
|
-
|