nlp 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/word.rb
CHANGED
@@ -2,22 +2,22 @@ require 'inflectable'
|
|
2
2
|
require 'meaningable'
|
3
3
|
|
4
4
|
module NLP
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
class Word < Token
|
6
|
+
|
7
|
+
include Inflectable
|
8
|
+
include Meaningable
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def initialize(word, lemat, tags)
|
13
|
-
super(word,tags)
|
14
|
-
@lemat = lemat
|
15
|
-
end
|
16
|
-
|
17
|
-
def inflection
|
18
|
-
@tags
|
19
|
-
end
|
10
|
+
attr_reader :lemat
|
11
|
+
attr_accessor :category
|
20
12
|
|
13
|
+
def initialize(word, lemat, tags)
|
14
|
+
super(word,tags)
|
15
|
+
@lemat = lemat
|
16
|
+
end
|
21
17
|
|
18
|
+
def inflection
|
19
|
+
@tags
|
22
20
|
end
|
21
|
+
|
22
|
+
end
|
23
23
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require '../lib/analyzer.rb'
|
2
|
+
|
3
|
+
|
4
|
+
class AnalyzerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
sample = "Ja byłam wtedy bardzo szczęśliwa"
|
10
|
+
@text = Lemmatizer.lemmatize(sample,:takipi,:local)
|
11
|
+
@scanner = TokenScanner.new(@text)
|
12
|
+
@rid_analyzer = Analyzer.new(:rid)
|
13
|
+
@liwc_analyzer = Analyzer.new(:liwc)
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_analyze
|
17
|
+
stats = @rid_analyzer.analyze(@scanner)
|
18
|
+
assert_kind_of Statistic, stats
|
19
|
+
assert_equal 5, stats.total_words
|
20
|
+
assert_equal 1, stats.word_count
|
21
|
+
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/lemmatizer.rb'
|
3
|
+
|
4
|
+
class LemmatizerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@sample = "Złe czasy już minęły."
|
10
|
+
|
11
|
+
|
12
|
+
@zle_word = Word.new('złe','zły','adj:pl:nom:m3:pos')
|
13
|
+
@czasy_word = Word.new('czasy','czas','subst:pl:nom:m3')
|
14
|
+
@minely_word = Word.new('minęły','minąć','praet:pl:m3:perf')
|
15
|
+
@juz_word = Word.new('już','już','qub')
|
16
|
+
@period = Token.new('.','interp')
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
def test_takipi_remote_lemmatizer
|
23
|
+
text = Lemmatizer.lemmatize(@sample,:takipi,:remote)
|
24
|
+
test_takipi_lemmatizer(text)
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
def test_takipi_local_lemmatizer
|
29
|
+
text = Lemmatizer.lemmatize(@sample,:takipi,:local)
|
30
|
+
test_takipi_lemmatizer(text)
|
31
|
+
end
|
32
|
+
|
33
|
+
|
34
|
+
def test_morfeusz_leamtizer
|
35
|
+
text = Lemmatizer.lemmatize(@sample)
|
36
|
+
assert_equal Text, text.class
|
37
|
+
assert_equal 1, text.sentences.size
|
38
|
+
assert_equal 4, text.sentences[0].words_number
|
39
|
+
|
40
|
+
tokens = text.sentences[0].tokens
|
41
|
+
zle,czasy,juz,minely,period = *tokens
|
42
|
+
assert_equal 'zły', zle.lemat
|
43
|
+
assert_equal 'czas', czasy.lemat
|
44
|
+
assert_equal 'już', juz.lemat
|
45
|
+
assert_equal 'minąć', minely.lemat
|
46
|
+
|
47
|
+
end
|
48
|
+
|
49
|
+
private
|
50
|
+
def test_takipi_lemmatizer(text)
|
51
|
+
|
52
|
+
assert_equal Text, text.class
|
53
|
+
assert_equal 1, text.sentences.size
|
54
|
+
assert_equal 4, text.sentences[0].words_number
|
55
|
+
|
56
|
+
tokens = text.sentences[0].tokens
|
57
|
+
zle, czasy, juz, minely, period = *tokens
|
58
|
+
assert_equal @zle_word.inflection, zle.inflection
|
59
|
+
assert_equal @czasy_word.inflection, czasy.inflection
|
60
|
+
assert_equal @juz_word.inflection, juz.inflection
|
61
|
+
assert_equal @minely_word.inflection, minely.inflection
|
62
|
+
assert_equal @period.tags, period.tags
|
63
|
+
|
64
|
+
assert_equal 'zły', zle.lemat
|
65
|
+
assert_equal 'czas', czasy.lemat
|
66
|
+
assert_equal 'już', juz.lemat
|
67
|
+
assert_equal 'minąć', minely.lemat
|
68
|
+
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/word.rb'
|
3
|
+
|
4
|
+
class MeaningableTest < Test::Unit::TestCase
|
5
|
+
include NLP
|
6
|
+
|
7
|
+
def setup
|
8
|
+
|
9
|
+
@word_kochamy = Word.new('kochamy','kochać', 'fin:pl:pri:imperf')
|
10
|
+
psych_cat = LIWCCategory.new('PROCESY_PSYCHOLOGICZNE')
|
11
|
+
emotion_cat = LIWCCategory.new('EMOCJE',psych_cat)
|
12
|
+
pos_emotion_cat = LIWCCategory.new('POZYTYWNE_EMOCJE',emotion_cat)
|
13
|
+
@word_kochamy.category = pos_emotion_cat
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_category_recognition
|
18
|
+
|
19
|
+
assert @word_kochamy.psychological?
|
20
|
+
assert @word_kochamy.positive_emotion?
|
21
|
+
assert @word_kochamy.emotion?
|
22
|
+
|
23
|
+
assert !@word_kochamy.negative_emotion?
|
24
|
+
end
|
25
|
+
|
26
|
+
|
27
|
+
end
|
28
|
+
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class SentenceTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@sentence = Sentence.new
|
10
|
+
@comma = Token.new(',','interp')
|
11
|
+
@integer = Token.new('32','tnum:integer')
|
12
|
+
@float = Token.new('3,12','tnum:frac')
|
13
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
14
|
+
end
|
15
|
+
|
16
|
+
|
17
|
+
def test_sentence_size
|
18
|
+
assert_equal 0, @sentence.words_number
|
19
|
+
@sentence << @symbol
|
20
|
+
assert_equal 1, @sentence.words_number
|
21
|
+
@sentence << @integer << @comma << @float
|
22
|
+
assert_equal 3, @sentence.words_number
|
23
|
+
end
|
24
|
+
|
25
|
+
|
26
|
+
end
|
data/test/text_test.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class TextTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@s1 = Sentence.new
|
10
|
+
@s2 = Sentence.new
|
11
|
+
@comma = Token.new(',','interp')
|
12
|
+
@integer = Token.new('32','tnum:integer')
|
13
|
+
@float = Token.new('3,12','tnum:frac')
|
14
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
15
|
+
@s1 << @integer << @comma << @symbol
|
16
|
+
@s2 << @integer << @symbol
|
17
|
+
@text = Text.new
|
18
|
+
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def test_text
|
23
|
+
@text << @s1
|
24
|
+
@text << @s2
|
25
|
+
assert_equal 2, @text.words_per_sentence
|
26
|
+
end
|
27
|
+
|
28
|
+
|
29
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token_scanner.rb'
|
3
|
+
|
4
|
+
class TokenScannerTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
sentence = "To, jest zdanie."
|
10
|
+
@text = Lemmatizer.lemmatize(sentence,:takipi,:local)
|
11
|
+
@scanner = TokenScanner.new(@text)
|
12
|
+
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_scanner
|
16
|
+
assert_equal "To", @scanner.current.orth
|
17
|
+
@scanner.next(:word)
|
18
|
+
assert_equal "jest", @scanner.current.orth
|
19
|
+
@scanner.next(:interp)
|
20
|
+
assert_equal ".", @scanner.current.orth
|
21
|
+
@scanner.next(:word)
|
22
|
+
assert @scanner.end?
|
23
|
+
@scanner.rewind
|
24
|
+
assert_equal 0, @scanner.index
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
28
|
+
|
data/test/token_test.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/token.rb'
|
3
|
+
|
4
|
+
class TokenTest < Test::Unit::TestCase
|
5
|
+
|
6
|
+
include NLP
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@comma = Token.new(',','interp')
|
10
|
+
@integer = Token.new('32','tnum:integer')
|
11
|
+
@float = Token.new('3,12','tnum:frac')
|
12
|
+
@symbol = Token.new('nie_istniejace_slowo','tsym')
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def test_recognizing_interpunction
|
17
|
+
assert @comma.interp?
|
18
|
+
assert !@comma.word?
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_recognizing_numbers
|
22
|
+
assert @integer.integer?
|
23
|
+
assert !@integer.word?
|
24
|
+
|
25
|
+
assert @float.float?
|
26
|
+
assert @float.number?
|
27
|
+
|
28
|
+
assert !@float.word?
|
29
|
+
assert !@float.integer?
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_symbol
|
33
|
+
assert @symbol.symbol?
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
end
|
data/test/word_test.rb
CHANGED
@@ -1,42 +1,45 @@
|
|
1
1
|
require 'helper'
|
2
2
|
require '../lib/word.rb'
|
3
|
+
|
3
4
|
class WordTest < Test::Unit::TestCase
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
5
|
+
include NLP
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@word_kota = Word.new('kota','kot','subst:sg:gen.acc:m2')
|
9
|
+
@word_siebie = Word.new('siebie','się','siebie:gen.acc')
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_word_lematization
|
13
|
+
assert_equal 'kot', @word_kota.lemat
|
14
|
+
assert_equal 'się', @word_siebie.lemat
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_word_orth
|
18
|
+
assert_equal 'kota', @word_kota.orth
|
19
|
+
assert_equal 'siebie', @word_siebie.orth
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_recognizing_part_of_speech
|
23
|
+
assert @word_kota.rzeczownik?
|
24
|
+
assert @word_siebie.zaimek?
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_recognizing_inflection
|
28
|
+
assert @word_kota.liczba_pojedyncza?
|
29
|
+
assert @word_kota.dopelniacz?
|
30
|
+
assert @word_kota.biernik?
|
31
|
+
assert @word_kota.meski_zwierzecy?
|
32
|
+
|
33
|
+
assert !@word_kota.liczba_mnoga?
|
34
|
+
assert !@word_kota.mianownik?
|
35
|
+
|
36
|
+
assert @word_siebie.biernik?
|
37
|
+
assert @word_siebie.dopelniacz?
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_inflection_string
|
41
|
+
assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
|
42
|
+
end
|
40
43
|
|
41
44
|
|
42
45
|
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 27
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 6
|
10
|
+
version: 0.2.6
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-06-21 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -45,10 +45,10 @@ files:
|
|
45
45
|
- lib/rid_analyzer.rb
|
46
46
|
- lib/rid_category.rb
|
47
47
|
- lib/sentence.rb
|
48
|
+
- lib/statistic.rb
|
48
49
|
- lib/stdlib/ext/array.rb
|
49
50
|
- lib/stdlib/ext/string.rb
|
50
51
|
- lib/stree.rb
|
51
|
-
- lib/takipi_web_service
|
52
52
|
- lib/takipi_web_service.rb
|
53
53
|
- lib/text.rb
|
54
54
|
- lib/token.rb
|
@@ -56,9 +56,17 @@ files:
|
|
56
56
|
- lib/word.rb
|
57
57
|
- LICENSE
|
58
58
|
- README.rdoc
|
59
|
+
- test/sentence_test.rb
|
60
|
+
- test/analyzer_test.rb
|
61
|
+
- test/meaningable_test.rb
|
62
|
+
- test/token_scanner_test.rb
|
59
63
|
- test/helper.rb
|
64
|
+
- test/nlp_test_suite.rb
|
60
65
|
- test/test_nlp.rb
|
61
66
|
- test/word_test.rb
|
67
|
+
- test/lemmatizer_test.rb
|
68
|
+
- test/token_test.rb
|
69
|
+
- test/text_test.rb
|
62
70
|
has_rdoc: true
|
63
71
|
homepage: http://github.com/knife/nlp
|
64
72
|
licenses: []
|
@@ -94,6 +102,14 @@ signing_key:
|
|
94
102
|
specification_version: 3
|
95
103
|
summary: Linguistics tools for processing polish language.
|
96
104
|
test_files:
|
105
|
+
- test/sentence_test.rb
|
106
|
+
- test/analyzer_test.rb
|
107
|
+
- test/meaningable_test.rb
|
108
|
+
- test/token_scanner_test.rb
|
97
109
|
- test/helper.rb
|
110
|
+
- test/nlp_test_suite.rb
|
98
111
|
- test/test_nlp.rb
|
99
112
|
- test/word_test.rb
|
113
|
+
- test/lemmatizer_test.rb
|
114
|
+
- test/token_test.rb
|
115
|
+
- test/text_test.rb
|