nlp 0.2.1 → 0.2.2
- data/lib/analyzer.rb +1 -5
- data/lib/dictionary.rb +0 -1
- data/lib/emoticon.rb +3 -2
- data/lib/inflectable.rb +20 -18
- data/lib/sentence.rb +8 -10
- data/lib/token.rb +25 -26
- data/lib/token_scanner.rb +104 -105
- data/lib/word.rb +7 -7
- data/test/word_test.rb +42 -0
- metadata +6 -4
data/lib/analyzer.rb
CHANGED
@@ -1,12 +1,10 @@
 require 'dictionary'
-require 'morfeusz'
+#require 'morfeusz'
 require 'token'
 require 'word'
 require 'emoticon'
 require 'sentence'
 require "token_scanner.rb"
-require "inflectable"
-require "meaningable"
 
 $KODE = "UTF8"
 
@@ -15,7 +13,6 @@ module NLP
   class Analyzer
 
     include REXML
-    #Lexeme = Apohllo::Morfeusz::Lexeme
 
     def initialize( category_file, restore = true )
       state_file = File.expand_path(Dictionary::CACHE_DIR)
@@ -46,7 +43,6 @@ module NLP
       unless categories.nil?
         categories.each do |category|
 
-          puts "#{word} : #{category.name}"
          results[:scores][category] = results[:scores][category] + 1
        end
 
data/lib/dictionary.rb
CHANGED
data/lib/emoticon.rb
CHANGED
data/lib/inflectable.rb
CHANGED
@@ -2,8 +2,8 @@ module Inflectable
 
   GRAM_CAT = {
     #rzeczownik
-    [:subst, :depr] => 'rzeczownik',
     :adj => 'przymiotnik',
+    [:subst,:depr] => 'rzeczownik',
     :adv => 'przyslowek',
     :num => 'liczebnik',
     [:pron,:siebie] => 'zaimek',
@@ -22,15 +22,15 @@ module Inflectable
     :voc => 'wolacz',
 
     #Rodzaje
-    :m1 => '
-    :m2 => '
-    :m3 => '
-    :f => '
-    :n1 => '
-    :n2 => 'nijaki
-    :p1 => '
-    :p2 => '
-    :p3 => '
+    :m1 => 'meski_osobowy',
+    :m2 => 'meski_zwierzecy',
+    :m3 => 'meski_rzeczowy',
+    :f => 'zenski',
+    :n1 => 'nijaki_zbiorowy',
+    :n2 => 'nijaki zwykly',
+    :p1 => 'przymnogi_osobowy',
+    :p2 => 'przymnogi_zwykly',
+    :p3 => 'przymnogi_opisowy',
 
     #Osoby
     :pri => "pierwsza_osoba",
@@ -44,16 +44,18 @@ module Inflectable
   }
 
   GRAM_CAT.each do |key,value|
-
-
-
-
-
-
-
+
+    define_method(value+"?"){
+      inflection.split(":").any?{|e|
+        if key.is_a? Array
+          key.any?{|k| e.include? k.to_s}
+        else
+          e.include? key.to_s
+        end
+      }
+    }
   end
 
 
-
 
 end
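
The GRAM_CAT.each loop above now metaprograms one predicate per grammatical category: for each entry it defines a method named value + "?" that returns true when any colon-separated segment of the word's inflection string matches the key (or any element of an array key). A minimal sketch of the resulting behaviour, mirroring the fixtures from the new test/word_test.rb:

  kota = NLP::Word.new('kota', 'kot', 'subst:sg:gen.acc:m2')
  kota.rzeczownik?        # true  -- "subst" occurs in a tag segment
  kota.meski_zwierzecy?   # true  -- "m2" occurs in a tag segment
  kota.liczba_mnoga?      # false -- no plural marker in the tag string (asserted in word_test.rb)
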
data/lib/sentence.rb
CHANGED
@@ -1,14 +1,12 @@
 module NLP
-  class Sentence
-
-
-
-
+  class Sentence
+    attr_reader :tokens
+    def initialize()
+      @tokens = []
+    end
 
-
-
+    def << tokens
+      @tokens.concat tokens
+    end
   end
-
-
-  end
 end
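
Sentence is now a plain token container: it exposes tokens and its << operator concatenates an array of tokens onto the sentence rather than nesting it. A short usage sketch (the token values are illustrative):

  sentence = NLP::Sentence.new
  sentence << [NLP::Word.new('Ala', 'Ala', 'subst:sg:nom:f'), NLP::Token.new('.', 'interp')]
  sentence.tokens.size   # 2 -- concat flattens the pushed array into @tokens
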
data/lib/token.rb
CHANGED
@@ -1,35 +1,34 @@
-
+
 module NLP
-  class Token
-
-
-
-
-    def initialize(orth,tags)
-      @orth = orth
-      @tags = tags
-    end
+  class Token
+    attr_reader :orth
+    attr_reader :tags
 
-
-
-
+
+    def initialize(orth,tags)
+      @orth = orth
+      @tags = tags
+    end
 
-
-
-
+    def interp?
+      @tags.eql? "interp"
+    end
 
-
-
-
+    def word?
+      not interp? and not number?
+    end
 
-
-
-
+    def number?
+      @tags.include?("tnum")
+    end
 
-
-
-
+    def integer?
+      @tags.include?("tnum:integer")
+    end
 
+    def float?
+      @tags.include?("tnum:frac")
+    end
 
-  end
+  end
 end
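
Token now exposes orth and tags and derives everything from the tag string: interp? means the tags equal "interp", number?/integer?/float? look for tnum markers, and word? is anything that is neither punctuation nor a number. A quick sketch with illustrative tag values:

  dot = NLP::Token.new('.', 'interp')
  num = NLP::Token.new('42', 'tnum:integer')
  dot.interp?    # true
  dot.word?      # false
  num.number?    # true
  num.integer?   # true
  num.float?     # false -- "tnum:frac" is not present
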
data/lib/token_scanner.rb
CHANGED
@@ -1,137 +1,136 @@
 
 require 'rexml/document'
 require 'soap/rpc/driver'
-module NLP
-  class TokenScanner
-    include REXML
-    attr_reader :text, :tokens
 
-
-
+module NLP
+  class TokenScanner
+    include REXML
 
-
-
-
-
-
-
-
+    attr_reader :text, :tokens
+
+    def initialize(text, method)
+
+      if method === :takipi
+        @text = load_lemated_text(text)
+      elsif method === :morfeusz
+        @text = lematize_text(text)
+      else
+        @text = text
+      end
+
+      @pos = 0
+      @tokens = flatten_text(@text)
     end
 
-
-
+    def next(type)
+      @pos+=1
 
-
-
-
-
-
-      @pos+= 1
-    end
+      case type
+      when :word
+        while @pos < @tokens.size and !@tokens[@pos].word?
+          @pos+= 1
+        end
 
-
-
-
-
-
-
-
-
+      when :interp
+        while @pos < @tokens.size and !@tokens[@pos].interp?
+          @pos+= 1
+        end
+
+      when :number
+        while @pos < @tokens.size and !@tokens[@pos].number?
+          @pos+= 1
+        end
      end
-
    end
-    end
 
-
-
-
+    def current
+
+      if @pos == @tokens.size
        nil
-
+      else
        @tokens[@pos]
-
-
-      end
-
-    def index
-      @pos
-    end
-
-    def end?
-      @pos == tokens.size
-    end
-
-
-    private
-
-    def flatten_text(text)
-      flattened = []
-      text.each { |s| s.tokens.each {|t| flattened.push t } }
-      flattened
-    end
-
-    def load_lemated_text(text_file)
-
-      t1 = Thread.new do
-        `takipi -i #{text_file} -o output.xml -it TXT`
-      end
-      t1.join
+      end
 
+    end
 
+    def index
+      @pos
+    end
 
-
+    def end?
+      @pos == tokens.size
+    end
+
 
-
-      File.open("output.xml") do |f|
-        doc = Document.new(f)
+    private
 
-
-
-
+    def flatten_text(text)
+      flattened = []
+      text.each { |s| s.tokens.each {|t| flattened.push t } }
+      flattened
+    end
 
-
-
-
+    #Tok
+
+    def load_lemated_text(text)
 
-
-
-
-
-
+      t1 = Thread.new do
+        `echo #{text} | takipi -i -o output.xml -it TXT`
+      end
+      t1.join
+
+      text = []
+      File.open("output.xml") do |f|
+        doc = Document.new(f)
+
+        doc.elements.each("*/chunkList/chunk") do |chunk|
+          sentence = Sentence.new
+          tokens = []
+
+          chunk.elements.each("tok") do |tok|
+            word = tok.elements[1].text
+            lemat, inflect = ""
+
+            tok.elements.each("lex") do |lex|
+              if lex.has_attributes?
+                lemat = lex.elements[1].text
+                inflect = lex.elements[2].text
+              end
+            end
+
+            tokens << Word.new(word,lemat,inflect)
          end
-
-      tokens << Word.new(word,lemat,inflect)
-    end
 
-
-
+          sentence << tokens
+          text << sentence
+        end
      end
-
-
-
-
-
-
-
-
-
-
-
-
-
+      text
+    end
+
+    def lematize_text(text)
+      temp_text = []
+      text.split(/\.|!|\?/).each do |s|
+        sentence = Sentence.new
+        sentence << s.split(" ").collect{ |t|
+          if word = Morfeusz::Lexeme.find(t)
+            if word[0]
+              Word.new(t,word[0].base_form,"")
+            else
+              Word.new(t,"","")
+            end
+          else
            Word.new(t,"","")
-
-
-
-
-
-      temp_text.push sentence
+          end
+        }
+        temp_text.push sentence
+      end
+      temp_text
    end
-      temp_text
-    end
 
 
 
 
-  end
+  end
 
 end
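
TokenScanner now takes the lemmatisation strategy as its second argument (:takipi shells out to the takipi tagger, :morfeusz goes through Morfeusz::Lexeme, anything else treats the input as already-built sentences) and keeps a cursor over the flattened token list. A sketch of the cursor API, assuming pre-built sentences are passed in so neither external tool is needed (the tokens and the :plain symbol are illustrative):

  sentence = NLP::Sentence.new
  sentence << [NLP::Word.new('Ala', 'Ala', 'subst:sg:nom:f'),
               NLP::Token.new(',', 'interp'),
               NLP::Word.new('kot', 'kot', 'subst:sg:nom:m2')]

  scanner = NLP::TokenScanner.new([sentence], :plain)  # any symbol other than :takipi/:morfeusz
  scanner.current.orth   # "Ala"
  scanner.next(:interp)  # advance the cursor to the next punctuation token
  scanner.current.orth   # ","
  scanner.next(:word)    # advance the cursor to the next word token
  scanner.current.orth   # "kot"
  scanner.end?           # false until the cursor moves past the last token
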
data/lib/word.rb
CHANGED
@@ -2,20 +2,20 @@ require 'inflectable'
 require 'meaningable'
 
 module NLP
-  class Word < Token
-
-
+  class Word < Token
+    include Inflectable
+    include Meaningable
 
     attr_reader :lemat, :orth
-
-
+
+    def initialize(word, lemat, tags)
      super(word,tags)
      @lemat = lemat
-
+    end
 
    def inflection
      @tags
    end
 
-  end
+  end
 end
data/test/word_test.rb
ADDED
@@ -0,0 +1,42 @@
+require 'helper'
+require '../lib/word.rb'
+class WordTest < Test::Unit::TestCase
+  def setup
+    @word_kota = NLP::Word.new('kota','kot','subst:sg:gen.acc:m2')
+    @word_siebie = NLP::Word.new('siebie','się','siebie:gen.acc')
+  end
+
+  def test_word_lematization
+    assert_equal 'kot', @word_kota.lemat
+    assert_equal 'się', @word_siebie.lemat
+  end
+
+  def test_word_orth
+    assert_equal 'kota', @word_kota.orth
+    assert_equal 'siebie', @word_siebie.orth
+  end
+
+  def test_recognizing_part_of_speech
+    assert @word_kota.rzeczownik?
+    assert @word_siebie.zaimek?
+  end
+
+  def test_recognizing_inflection
+    assert @word_kota.liczba_pojedyncza?
+    assert @word_kota.dopelniacz?
+    assert @word_kota.biernik?
+    assert @word_kota.meski_zwierzecy?
+
+    assert_equal false, @word_kota.liczba_mnoga?
+    assert_equal false, @word_kota.mianownik?
+
+    assert @word_siebie.biernik?
+    assert @word_siebie.dopelniacz?
+  end
+
+  def test_inflection_string
+    assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
+  end
+
+
+end
metadata
CHANGED
@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: nlp
 version: !ruby/object:Gem::Version
-  hash:
+  hash: 19
   prerelease: false
   segments:
   - 0
   - 2
-  - 1
-  version: 0.2.1
+  - 2
+  version: 0.2.2
 platform: ruby
 authors:
 - knife
@@ -15,7 +15,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-09-
+date: 2010-09-06 00:00:00 +02:00
 default_executable:
 dependencies: []
 
@@ -51,6 +51,7 @@ files:
 - README.rdoc
 - test/helper.rb
 - test/test_nlp.rb
+- test/word_test.rb
 has_rdoc: true
 homepage: http://github.com/knife/nlp
 licenses: []
@@ -88,3 +89,4 @@ summary: Linguistics tools for processing polish language.
 test_files:
 - test/helper.rb
 - test/test_nlp.rb
+- test/word_test.rb