nlp 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/lib/rid_analyzer.rb
DELETED
data/lib/rid_category.rb
DELETED
data/lib/sentence.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
module NLP
  # An ordered list of tokens that together form one sentence.
  class Sentence
    # @return [Array] the tokens collected so far, in insertion order
    attr_reader :tokens

    # Creates a sentence with no tokens.
    def initialize
      @tokens = []
    end

    # Appends to the sentence.
    #
    # @param tokens [Object, Array] a single token, or an Array whose
    #   elements are each appended individually
    # @return [Sentence] self, so that calls can be chained
    def <<(tokens)
      case tokens
      when Array then @tokens.concat(tokens)
      else @tokens.push(tokens)
      end
      self
    end

    # @return [Integer] number of tokens whose +interp?+ predicate is falsy
    def words_number
      @tokens.reject { |token| token.interp? }.size
    end
  end
end
|
data/lib/statistic.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
# Collects the results of matching words against categories: how many
# words were matched, which words, and how often each category was hit.
# It also exposes a small key/value store through #[] and #[]=.
class Statistic
  # +total_words+ is not touched by #add; it is maintained by the caller.
  # NOTE(review): the +hash+ accessor shadows Object#hash. It is kept
  # because it is part of the public interface of this class.
  attr_accessor :total_words, :hash

  # +cwords+     - Hash mapping a category name to the words added for it.
  # +words+      - every word passed to #add, in order.
  # +word_count+ - number of #add calls so far.
  attr_reader :cwords, :words, :word_count

  def initialize
    @word_count = 0
    @total_words = 0
    @scores = Hash.new(0)
    @words = []
    @cwords = {}
    # Initialised so that #[] and #[]= work without the caller having to
    # assign +hash+ first.
    @hash = {}
  end

  # Records one matched word.
  #
  # @param word [Object] the matched word
  # @param category [#name] the category it matched; the score is keyed by
  #   the category object itself, the word list by +category.name+
  # @return [Array] the list of words recorded so far for that category name
  def add(word, category)
    @scores[category] += 1
    @word_count += 1
    @words.push(word)
    (@cwords[category.name] ||= []).push(word)
  end

  # Stores +value+ under +key+ in the auxiliary hash.
  def []=(key, value)
    @hash[key] = value
  end

  # Reads +key+ from the auxiliary hash.
  def [](key)
    @hash[key]
  end

  # For every entry of +categories+, computes the fraction of added words
  # whose category answers truthy to the predicate "<entry>?".
  #
  # @param categories [Enumerable] predicate names without the trailing "?"
  # @return [Hash] each entry of +categories+ mapped to a Float
  def category_participation(categories)
    categories.each_with_object({}) do |cat, shares|
      shares[cat] = percentage_distribution(@scores) { |c| c.send("#{cat}?") }
    end
  end

  private

  # Sums the scores of the categories accepted by the block and divides
  # the sum by the number of added words. Returns 0.0 when nothing has
  # been added yet, instead of the NaN that 0.0 / 0 would produce.
  def percentage_distribution(scores, &block)
    return 0.0 if @word_count.zero?

    matched = scores.inject(0) do |sum, (category, score)|
      yield(category) ? sum + score : sum
    end
    Float(matched) / @word_count
  end
end
|
54
|
-
|
55
|
-
|
data/lib/stdlib/ext/string.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# Character-oriented helpers added to String.
class String
  # Keeps the built-in String#[] reachable under another name. The spelling
  # of the alias is part of the public interface and is therefore preserved.
  alias old_memeber []

  # Delegates to the built-in String#[].
  #
  # @param index [Integer, Range, String, Regexp] anything String#[] accepts
  def ordinary(index)
    old_memeber(index)
  end

  # Returns the character at +index+. Every character counts, including
  # newlines.
  #
  # @param index [Integer] character position (negative counts from the end)
  # @return [String, nil] the character, or nil when out of range
  def get(index)
    chars.to_a[index]
  end

  # Replaces the character at +index+ with +value+, modifying the receiver
  # in place. All other characters, including newlines, are kept as they are.
  #
  # @param index [Integer] character position
  # @param value [String] replacement text
  # @return [String] +value+
  def set(index, value)
    letters = chars.to_a
    letters[index] = value
    replace(letters.join)
    value
  end
end
|
19
|
-
|
data/lib/stree.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
|
2
|
-
module NLP
  # A character trie. Every node keeps a list of values and one child slot
  # per entry of ALPHABET. A "*" stored in a key acts as a wildcard on
  # lookup: as soon as a node with a "*" child is reached, that child's
  # values are returned regardless of the rest of the searched string.
  class SearchTree
    # Child slot order:
    #   0 -> *
    #   1 -> -
    #   2 -> a
    #  33 -> ż
    ALPHABET = %w{* - a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u w y z ź ż}.freeze
    SYMBOLS = %w{* - : - / ) (}.freeze

    attr_accessor :value
    attr_accessor :subtrees

    def initialize
      @subtrees = Array.new(ALPHABET.size, nil)
      @value = []
    end

    # Stores +value+ under the key +s+.
    #
    # @param s [String] the key; split into characters with scan(/./)
    # @param value [Object] appended to the values of the node reached by +s+
    # @return [Array] the value list of that node
    def insert(s, value)
      priv_insert(s.scan(/./), value)
    end

    # Looks +s+ up.
    #
    # @param s [String] the string to search for
    # @return [Array, nil] the value list of the matching node, or nil when
    #   the path does not exist
    def find(s)
      priv_find(s.scan(/./))
    end

    # Yields the value list of this node and then of every descendant,
    # level by level. Empty child slots are skipped.
    #
    # @yieldparam values [Array] the value list of one node
    # @return [nil]
    def traverse
      pending = [self]
      until pending.empty?
        node = pending.shift
        yield node.value
        pending.concat(node.subtrees.compact) unless node.subtrees.nil?
      end
    end

    protected

    # Maps a character to its child slot.
    #
    # Characters missing from ALPHABET map to -1, which addresses the last
    # slot of +subtrees+; they therefore share a subtree with the last
    # letter of ALPHABET.
    #
    # @raise [ArgumentError] when +chr+ is nil or false
    def key(chr)
      raise ArgumentError, "Argument chr is nil" unless chr

      ALPHABET.index(chr) || -1
    end

    # Walks/creates the path for the characters in +s+ and appends +value+
    # at its end.
    def priv_insert(s, value)
      return @value.push(value) if s.empty?

      index = key(s.first)
      subtree = (@subtrees[index] ||= SearchTree.new)
      subtree.priv_insert(s.drop(1), value)
    end

    # Follows the characters in +search+. A "*" child (slot 0) on the
    # current node wins over everything else.
    def priv_find(search)
      wildcard = @subtrees[0]
      return wildcard.value if wildcard
      return value if search.empty?

      child = @subtrees[key(search.first)]
      child && child.priv_find(search.drop(1))
    end
  end
end
|
data/lib/takipi_web_service.rb
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'savon'
|
3
|
-
|
4
|
-
# Client for the remote TaKIPI tagging web service, accessed over SOAP
# through Savon.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Submits +text+ for tagging, polls until the service stops reporting
  # status 2 or 3 (or the polling budget is used up), then fetches the
  # result.
  #
  # NOTE(review): +text+ is interpolated into the SOAP body without XML
  # escaping; callers passing text containing "<" or "&" should be checked.
  #
  # @param text [String] the text to tag
  # @return [String] the service reply wrapped in <xml><chunkList>…</chunkList></xml>
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Submit the job.
    submitted = client.tag do |soap|
      soap.body = "<text>#{text}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end.to_hash

    tag_response = submitted[:tag_response][:tag_response]
    token = tag_response[:msg]
    status = tag_response[:status].to_i

    # Poll while the job is reported as status 2 or 3. Any other status
    # (1, or a value this code does not know) ends the polling, so an
    # unexpected status can no longer spin forever.
    # NOTE(review): the budget advances by +step+ per poll while the actual
    # pause is one second, so at most 13 polls are made; kept as it was.
    timeout = 60
    step = 5
    waited = 0
    while waited <= timeout && (status == 2 || status == 3)
      waited += step
      sleep(1)
      polled = client.get_status do |soap|
        soap.body = "<token>#{token}</token>"
      end.to_hash
      status = polled[:get_status_response][:status].to_i
    end

    # Fetch the result.
    result = client.get_result do |soap|
      soap.body = "<token>#{token}</token>"
    end

    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # Wrap the reply so that it forms a well-formed XML string.
    "<xml><chunkList>#{response_document}</chunkList></xml>"
  end
end
|
51
|
-
|
data/lib/text.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module NLP
  # A text represented as a list of sentences.
  class Text
    # @return [Array] the sentences added so far, in insertion order
    attr_reader :sentences

    # Creates a text with no sentences.
    def initialize
      @sentences = []
    end

    # Appends one sentence.
    #
    # @param sentence [Object] the sentence to add
    # @return [Array] the internal sentence list
    def <<(sentence)
      @sentences << sentence
    end

    # Mean of +words_number+ over all sentences; relies on +mean+ being
    # available on Array.
    def words_per_sentence
      counts = @sentences.map(&:words_number)
      counts.mean
    end

    # @return [Array] the tokens of all sentences as one flat list
    def flatten
      @sentences.each_with_object([]) do |sentence, all_tokens|
        sentence.tokens.each { |token| all_tokens << token }
      end
    end
  end
end
|
data/lib/token.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
module NLP
  # A token: its written form plus the tag string describing it.
  class Token
    # +orth+ - the written form passed to the constructor.
    # +tags+ - the tag string passed to the constructor.
    attr_reader :orth, :tags

    # @param orth [String] written form of the token
    # @param tags [String] tag string of the token
    def initialize(orth, tags)
      @orth = orth
      @tags = tags
    end

    # True when the tag string is exactly "tsym".
    def symbol?
      @tags.eql?('tsym')
    end

    # True when the tag string is exactly "interp".
    def interp?
      @tags.eql?('interp')
    end

    # True when the token is neither +interp?+ nor +number?+.
    def word?
      !(interp? || number?)
    end

    # True when the tag string contains "tnum".
    def number?
      @tags.include?('tnum')
    end

    # True when the tag string contains "tnum:integer".
    def integer?
      @tags.include?('tnum:integer')
    end

    # True when the tag string contains "tnum:frac".
    def float?
      @tags.include?('tnum:frac')
    end
  end
end
|
data/lib/token_scanner.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
module NLP
  # A forward cursor over the flattened tokens of a text.
  class TokenScanner
    # +text+   - the object passed to the constructor.
    # +tokens+ - the result of calling +flatten+ on it.
    attr_reader :text, :tokens

    # @param text [#flatten] source of the token list
    def initialize(text)
      @text = text
      @pos = 0
      @tokens = @text.flatten
    end

    # Moves the cursor one step forward and then keeps moving until it
    # rests on a token of the requested kind or runs off the end.
    #
    # The cursor never moves past +tokens.size+, so calling this method at
    # the end of the input leaves the scanner in the +end?+ state.
    #
    # @param type [Symbol] :word, :interp, :number or :alphanum (a number or
    #   a word); any other value just advances by one token
    # @return [nil]
    def next(type)
      @pos += 1 unless end?
      @pos += 1 until end? || token_matches?(@tokens[@pos], type)
      nil
    end

    # @return [Object, nil] the token under the cursor, or nil at the end
    def current
      end? ? nil : @tokens[@pos]
    end

    # Moves the cursor back to the first token.
    def rewind
      @pos = 0
    end

    # @return [Integer] the current cursor position
    def index
      @pos
    end

    # @return [Boolean] true once the cursor is past the last token
    def end?
      @pos >= @tokens.size
    end

    private

    # Tells whether +token+ satisfies the predicate selected by +type+.
    def token_matches?(token, type)
      case type
      when :word then token.word?
      when :interp then token.interp?
      when :number then token.number?
      when :alphanum then token.number? || token.word?
      else true
      end
    end
  end
end
|
data/lib/word.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'inflectable'
|
2
|
-
require 'meaningable'
|
3
|
-
|
4
|
-
module NLP
  # A Token that additionally carries a lemma and an assignable category,
  # and mixes in Inflectable and Meaningable.
  class Word < Token
    include Inflectable
    include Meaningable

    # +lemat+ - the lemma given to the constructor.
    attr_reader :lemat
    # +category+ - not set by the constructor; assigned by the caller.
    attr_accessor :category

    # @param word [String] written form, forwarded to Token as +orth+
    # @param lemat [String] the lemma of the word
    # @param tags [String] tag string, forwarded to Token
    def initialize(word, lemat, tags)
      super(word, tags)
      @lemat = lemat
    end

    # @return [String] the tag string this word was created with
    def inflection
      @tags
    end
  end
end
|
data/test/analyzer_test.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
require '../lib/analyzer.rb'
|
2
|
-
|
3
|
-
|
4
|
-
# Exercises Analyzer#analyze on one short, locally lemmatized sentence.
class AnalyzerTest < Test::Unit::TestCase
  include NLP

  # Lemmatizes the sample sentence, wraps it in a scanner and builds one
  # analyzer per dictionary.
  def setup
    sentence = "Ja byłam wtedy bardzo szczęśliwa"
    @text = Lemmatizer.lemmatize(sentence, :takipi, :local)
    @scanner = TokenScanner.new(@text)
    @rid_analyzer = Analyzer.new(:rid)
    @liwc_analyzer = Analyzer.new(:liwc)
  end

  # The RID analyzer must return a Statistic reporting five words in total,
  # one of which was counted.
  def test_analyze
    result = @rid_analyzer.analyze(@scanner)

    assert_kind_of(Statistic, result)
    assert_equal(5, result.total_words)
    assert_equal(1, result.word_count)
  end
end
|
25
|
-
|