nlp 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/word.rb
CHANGED
@@ -2,22 +2,22 @@ require 'inflectable'
|
|
2
2
|
require 'meaningable'
|
3
3
|
|
4
4
|
module NLP
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
class Word < Token
|
6
|
+
|
7
|
+
include Inflectable
|
8
|
+
include Meaningable
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def initialize(word, lemat, tags)
|
13
|
-
super(word,tags)
|
14
|
-
@lemat = lemat
|
15
|
-
end
|
16
|
-
|
17
|
-
def inflection
|
18
|
-
@tags
|
19
|
-
end
|
10
|
+
attr_reader :lemat
|
11
|
+
attr_accessor :category
|
20
12
|
|
13
|
+
def initialize(word, lemat, tags)
|
14
|
+
super(word,tags)
|
15
|
+
@lemat = lemat
|
16
|
+
end
|
21
17
|
|
18
|
+
def inflection
|
19
|
+
@tags
|
22
20
|
end
|
21
|
+
|
22
|
+
end
|
23
23
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require '../lib/analyzer.rb'
|
2
|
+
|
3
|
+
|
4
|
+
class AnalyzerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
sample = "Ja byłam wtedy bardzo szczęśliwa"
|
10
|
+
@text = Lemmatizer.lemmatize(sample,:takipi,:local)
|
11
|
+
@scanner = TokenScanner.new(@text)
|
12
|
+
@rid_analyzer = Analyzer.new(:rid)
|
13
|
+
@liwc_analyzer = Analyzer.new(:liwc)
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_analyze
|
17
|
+
stats = @rid_analyzer.analyze(@scanner)
|
18
|
+
assert_kind_of Statistic, stats
|
19
|
+
assert_equal 5, stats.total_words
|
20
|
+
assert_equal 1, stats.word_count
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/lemmatizer.rb'
|
3
|
+
|
4
|
+
class LemmatizerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@sample = "Złe czasy już minęły."
|
10
|
+
|
11
|
+
|
12
|
+
@zle_word = Word.new('złe','zły','adj:pl:nom:m3:pos')
|
13
|
+
@czasy_word = Word.new('czasy','czas','subst:pl:nom:m3')
|
14
|
+
@minely_word = Word.new('minęły','minąć','praet:pl:m3:perf')
|
15
|
+
@juz_word = Word.new('już','już','qub')
|
16
|
+
@period = Token.new('.','interp')
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
def test_takipi_remote_lemmatizer
|
23
|
+
text = Lemmatizer.lemmatize(@sample,:takipi,:remote)
|
24
|
+
test_takipi_lemmatizer(text)
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
def test_takipi_local_lemmatizer
|
29
|
+
text = Lemmatizer.lemmatize(@sample,:takipi,:local)
|
30
|
+
test_takipi_lemmatizer(text)
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
def test_morfeusz_leamtizer
|
35
|
+
text = Lemmatizer.lemmatize(@sample)
|
36
|
+
assert_equal Text, text.class
|
37
|
+
assert_equal 1, text.sentences.size
|
38
|
+
assert_equal 4, text.sentences[0].words_number
|
39
|
+
|
40
|
+
tokens = text.sentences[0].tokens
|
41
|
+
zle,czasy,juz,minely,period = *tokens
|
42
|
+
assert_equal 'zły', zle.lemat
|
43
|
+
assert_equal 'czas', czasy.lemat
|
44
|
+
assert_equal 'już', juz.lemat
|
45
|
+
assert_equal 'minąć', minely.lemat
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
def test_takipi_lemmatizer(text)
|
51
|
+
|
52
|
+
assert_equal Text, text.class
|
53
|
+
assert_equal 1, text.sentences.size
|
54
|
+
assert_equal 4, text.sentences[0].words_number
|
55
|
+
|
56
|
+
tokens = text.sentences[0].tokens
|
57
|
+
zle, czasy, juz, minely, period = *tokens
|
58
|
+
assert_equal @zle_word.inflection, zle.inflection
|
59
|
+
assert_equal @czasy_word.inflection, czasy.inflection
|
60
|
+
assert_equal @juz_word.inflection, juz.inflection
|
61
|
+
assert_equal @minely_word.inflection, minely.inflection
|
62
|
+
assert_equal @period.tags, period.tags
|
63
|
+
|
64
|
+
assert_equal 'zły', zle.lemat
|
65
|
+
assert_equal 'czas', czasy.lemat
|
66
|
+
assert_equal 'już', juz.lemat
|
67
|
+
assert_equal 'minąć', minely.lemat
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/word.rb'
|
3
|
+
|
4
|
+
class MeaningableTest < Test::Unit::TestCase
|
5
|
+
include NLP
|
6
|
+
|
7
|
+
def setup
|
8
|
+
|
9
|
+
@word_kochamy = Word.new('kochamy','kochać', 'fin:pl:pri:imperf')
|
10
|
+
psych_cat = LIWCCategory.new('PROCESY_PSYCHOLOGICZNE')
|
11
|
+
emotion_cat = LIWCCategory.new('EMOCJE',psych_cat)
|
12
|
+
pos_emotion_cat = LIWCCategory.new('POZYTYWNE_EMOCJE',emotion_cat)
|
13
|
+
@word_kochamy.category = pos_emotion_cat
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_category_recognition
|
18
|
+
|
19
|
+
assert @word_kochamy.psychological?
|
20
|
+
assert @word_kochamy.positive_emotion?
|
21
|
+
assert @word_kochamy.emotion?
|
22
|
+
|
23
|
+
assert !@word_kochamy.negative_emotion?
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
end
|
28
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class SentenceTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@sentence = Sentence.new
|
10
|
+
@comma = Token.new(',','interp')
|
11
|
+
@integer = Token.new('32','tnum:integer')
|
12
|
+
@float = Token.new('3,12','tnum:frac')
|
13
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
def test_sentence_size
|
18
|
+
assert_equal 0, @sentence.words_number
|
19
|
+
@sentence << @symbol
|
20
|
+
assert_equal 1, @sentence.words_number
|
21
|
+
@sentence << @integer << @comma << @float
|
22
|
+
assert_equal 3, @sentence.words_number
|
23
|
+
end
|
24
|
+
|
25
|
+
|
26
|
+
end
|
data/test/text_test.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class TextTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@s1 = Sentence.new
|
10
|
+
@s2 = Sentence.new
|
11
|
+
@comma = Token.new(',','interp')
|
12
|
+
@integer = Token.new('32','tnum:integer')
|
13
|
+
@float = Token.new('3,12','tnum:frac')
|
14
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
15
|
+
@s1 << @integer << @comma << @symbol
|
16
|
+
@s2 << @integer << @symbol
|
17
|
+
@text = Text.new
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def test_text
|
23
|
+
@text << @s1
|
24
|
+
@text << @s2
|
25
|
+
assert_equal 2, @text.words_per_sentence
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token_scanner.rb'
|
3
|
+
|
4
|
+
class TokenScannerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
sentence = "To, jest zdanie."
|
10
|
+
@text = Lemmatizer.lemmatize(sentence,:takipi,:local)
|
11
|
+
@scanner = TokenScanner.new(@text)
|
12
|
+
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_scanner
|
16
|
+
assert_equal "To", @scanner.current.orth
|
17
|
+
@scanner.next(:word)
|
18
|
+
assert_equal "jest", @scanner.current.orth
|
19
|
+
@scanner.next(:interp)
|
20
|
+
assert_equal ".", @scanner.current.orth
|
21
|
+
@scanner.next(:word)
|
22
|
+
assert @scanner.end?
|
23
|
+
@scanner.rewind
|
24
|
+
assert_equal 0, @scanner.index
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
data/test/token_test.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class TokenTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@comma = Token.new(',','interp')
|
10
|
+
@integer = Token.new('32','tnum:integer')
|
11
|
+
@float = Token.new('3,12','tnum:frac')
|
12
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def test_recognizing_interpunction
|
17
|
+
assert @comma.interp?
|
18
|
+
assert !@comma.word?
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_recognizing_numbers
|
22
|
+
assert @integer.integer?
|
23
|
+
assert !@integer.word?
|
24
|
+
|
25
|
+
assert @float.float?
|
26
|
+
assert @float.number?
|
27
|
+
|
28
|
+
assert !@float.word?
|
29
|
+
assert !@float.integer?
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_symbol
|
33
|
+
assert @symbol.symbol?
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
end
|
data/test/word_test.rb
CHANGED
@@ -1,42 +1,45 @@
|
|
1
1
|
require 'helper'
|
2
2
|
require '../lib/word.rb'
|
3
|
+
|
3
4
|
class WordTest < Test::Unit::TestCase
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
5
|
+
include NLP
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@word_kota = Word.new('kota','kot','subst:sg:gen.acc:m2')
|
9
|
+
@word_siebie = Word.new('siebie','się','siebie:gen.acc')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_word_lematization
|
13
|
+
assert_equal 'kot', @word_kota.lemat
|
14
|
+
assert_equal 'się', @word_siebie.lemat
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_word_orth
|
18
|
+
assert_equal 'kota', @word_kota.orth
|
19
|
+
assert_equal 'siebie', @word_siebie.orth
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_recognizing_part_of_speech
|
23
|
+
assert @word_kota.rzeczownik?
|
24
|
+
assert @word_siebie.zaimek?
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_recognizing_inflection
|
28
|
+
assert @word_kota.liczba_pojedyncza?
|
29
|
+
assert @word_kota.dopelniacz?
|
30
|
+
assert @word_kota.biernik?
|
31
|
+
assert @word_kota.meski_zwierzecy?
|
32
|
+
|
33
|
+
assert !@word_kota.liczba_mnoga?
|
34
|
+
assert !@word_kota.mianownik?
|
35
|
+
|
36
|
+
assert @word_siebie.biernik?
|
37
|
+
assert @word_siebie.dopelniacz?
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_inflection_string
|
41
|
+
assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
|
42
|
+
end
|
40
43
|
|
41
44
|
|
42
45
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 5
|
10
|
-
version: 0.2.5
|
9
|
+
- 6
|
10
|
+
version: 0.2.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-06-21 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -45,10 +45,10 @@ files:
|
|
45
45
|
- lib/rid_analyzer.rb
|
46
46
|
- lib/rid_category.rb
|
47
47
|
- lib/sentence.rb
|
48
|
+
- lib/statistic.rb
|
48
49
|
- lib/stdlib/ext/array.rb
|
49
50
|
- lib/stdlib/ext/string.rb
|
50
51
|
- lib/stree.rb
|
51
|
-
- lib/takipi_web_service
|
52
52
|
- lib/takipi_web_service.rb
|
53
53
|
- lib/text.rb
|
54
54
|
- lib/token.rb
|
@@ -56,9 +56,17 @@ files:
|
|
56
56
|
- lib/word.rb
|
57
57
|
- LICENSE
|
58
58
|
- README.rdoc
|
59
|
+
- test/sentence_test.rb
|
60
|
+
- test/analyzer_test.rb
|
61
|
+
- test/meaningable_test.rb
|
62
|
+
- test/token_scanner_test.rb
|
59
63
|
- test/helper.rb
|
64
|
+
- test/nlp_test_suite.rb
|
60
65
|
- test/test_nlp.rb
|
61
66
|
- test/word_test.rb
|
67
|
+
- test/lemmatizer_test.rb
|
68
|
+
- test/token_test.rb
|
69
|
+
- test/text_test.rb
|
62
70
|
has_rdoc: true
|
63
71
|
homepage: http://github.com/knife/nlp
|
64
72
|
licenses: []
|
@@ -94,6 +102,14 @@ signing_key:
|
|
94
102
|
specification_version: 3
|
95
103
|
summary: Linguistics tools for processing polish language.
|
96
104
|
test_files:
|
105
|
+
- test/sentence_test.rb
|
106
|
+
- test/analyzer_test.rb
|
107
|
+
- test/meaningable_test.rb
|
108
|
+
- test/token_scanner_test.rb
|
97
109
|
- test/helper.rb
|
110
|
+
- test/nlp_test_suite.rb
|
98
111
|
- test/test_nlp.rb
|
99
112
|
- test/word_test.rb
|
113
|
+
- test/lemmatizer_test.rb
|
114
|
+
- test/token_test.rb
|
115
|
+
- test/text_test.rb
|