nlp 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analyzer.rb +1 -5
- data/lib/dictionary.rb +0 -1
- data/lib/emoticon.rb +3 -2
- data/lib/inflectable.rb +20 -18
- data/lib/sentence.rb +8 -10
- data/lib/token.rb +25 -26
- data/lib/token_scanner.rb +104 -105
- data/lib/word.rb +7 -7
- data/test/word_test.rb +42 -0
- metadata +6 -4
data/lib/analyzer.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
require 'dictionary'
|
2
|
-
require 'morfeusz'
|
2
|
+
#require 'morfeusz'
|
3
3
|
require 'token'
|
4
4
|
require 'word'
|
5
5
|
require 'emoticon'
|
6
6
|
require 'sentence'
|
7
7
|
require "token_scanner.rb"
|
8
|
-
require "inflectable"
|
9
|
-
require "meaningable"
|
10
8
|
|
11
9
|
$KODE = "UTF8"
|
12
10
|
|
@@ -15,7 +13,6 @@ module NLP
|
|
15
13
|
class Analyzer
|
16
14
|
|
17
15
|
include REXML
|
18
|
-
#Lexeme = Apohllo::Morfeusz::Lexeme
|
19
16
|
|
20
17
|
def initialize( category_file, restore = true )
|
21
18
|
state_file = File.expand_path(Dictionary::CACHE_DIR)
|
@@ -46,7 +43,6 @@ module NLP
|
|
46
43
|
unless categories.nil?
|
47
44
|
categories.each do |category|
|
48
45
|
|
49
|
-
puts "#{word} : #{category.name}"
|
50
46
|
results[:scores][category] = results[:scores][category] + 1
|
51
47
|
end
|
52
48
|
|
data/lib/dictionary.rb
CHANGED
data/lib/emoticon.rb
CHANGED
data/lib/inflectable.rb
CHANGED
@@ -2,8 +2,8 @@ module Inflectable
|
|
2
2
|
|
3
3
|
GRAM_CAT = {
|
4
4
|
#rzeczownik
|
5
|
-
[:subst, :depr] => 'rzeczownik',
|
6
5
|
:adj => 'przymiotnik',
|
6
|
+
[:subst,:depr] => 'rzeczownik',
|
7
7
|
:adv => 'przyslowek',
|
8
8
|
:num => 'liczebnik',
|
9
9
|
[:pron,:siebie] => 'zaimek',
|
@@ -22,15 +22,15 @@ module Inflectable
|
|
22
22
|
:voc => 'wolacz',
|
23
23
|
|
24
24
|
#Rodzaje
|
25
|
-
:m1 => '
|
26
|
-
:m2 => '
|
27
|
-
:m3 => '
|
28
|
-
:f => '
|
29
|
-
:n1 => '
|
30
|
-
:n2 => 'nijaki
|
31
|
-
:p1 => '
|
32
|
-
:p2 => '
|
33
|
-
:p3 => '
|
25
|
+
:m1 => 'meski_osobowy',
|
26
|
+
:m2 => 'meski_zwierzecy',
|
27
|
+
:m3 => 'meski_rzeczowy',
|
28
|
+
:f => 'zenski',
|
29
|
+
:n1 => 'nijaki_zbiorowy',
|
30
|
+
:n2 => 'nijaki zwykly',
|
31
|
+
:p1 => 'przymnogi_osobowy',
|
32
|
+
:p2 => 'przymnogi_zwykly',
|
33
|
+
:p3 => 'przymnogi_opisowy',
|
34
34
|
|
35
35
|
#Osoby
|
36
36
|
:pri => "pierwsza_osoba",
|
@@ -44,16 +44,18 @@ module Inflectable
|
|
44
44
|
}
|
45
45
|
|
46
46
|
GRAM_CAT.each do |key,value|
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
47
|
+
|
48
|
+
define_method(value+"?"){
|
49
|
+
inflection.split(":").any?{|e|
|
50
|
+
if key.is_a? Array
|
51
|
+
key.any?{|k| e.include? k.to_s}
|
52
|
+
else
|
53
|
+
e.include? key.to_s
|
54
|
+
end
|
55
|
+
}
|
56
|
+
}
|
54
57
|
end
|
55
58
|
|
56
59
|
|
57
|
-
|
58
60
|
|
59
61
|
end
|
data/lib/sentence.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
1
|
module NLP
|
2
|
-
class Sentence
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
2
|
+
class Sentence
|
3
|
+
attr_reader :tokens
|
4
|
+
def initialize()
|
5
|
+
@tokens = []
|
6
|
+
end
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
def << tokens
|
9
|
+
@tokens.concat tokens
|
10
|
+
end
|
10
11
|
end
|
11
|
-
|
12
|
-
|
13
|
-
end
|
14
12
|
end
|
data/lib/token.rb
CHANGED
@@ -1,35 +1,34 @@
|
|
1
|
-
|
1
|
+
|
2
2
|
module NLP
|
3
|
-
class Token
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def initialize(orth,tags)
|
9
|
-
@orth = orth
|
10
|
-
@tags = tags
|
11
|
-
end
|
3
|
+
class Token
|
4
|
+
attr_reader :orth
|
5
|
+
attr_reader :tags
|
12
6
|
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
|
8
|
+
def initialize(orth,tags)
|
9
|
+
@orth = orth
|
10
|
+
@tags = tags
|
11
|
+
end
|
16
12
|
|
17
|
-
|
18
|
-
|
19
|
-
|
13
|
+
def interp?
|
14
|
+
@tags.eql? "interp"
|
15
|
+
end
|
20
16
|
|
21
|
-
|
22
|
-
|
23
|
-
|
17
|
+
def word?
|
18
|
+
not interp? and not number?
|
19
|
+
end
|
24
20
|
|
25
|
-
|
26
|
-
|
27
|
-
|
21
|
+
def number?
|
22
|
+
@tags.include?("tnum")
|
23
|
+
end
|
28
24
|
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
def integer?
|
26
|
+
@tags.include?("tnum:integer")
|
27
|
+
end
|
32
28
|
|
29
|
+
def float?
|
30
|
+
@tags.include?("tnum:frac")
|
31
|
+
end
|
33
32
|
|
34
|
-
end
|
33
|
+
end
|
35
34
|
end
|
data/lib/token_scanner.rb
CHANGED
@@ -1,137 +1,136 @@
|
|
1
1
|
|
2
2
|
require 'rexml/document'
|
3
3
|
require 'soap/rpc/driver'
|
4
|
-
module NLP
|
5
|
-
class TokenScanner
|
6
|
-
include REXML
|
7
|
-
attr_reader :text, :tokens
|
8
4
|
|
9
|
-
|
10
|
-
|
5
|
+
module NLP
|
6
|
+
class TokenScanner
|
7
|
+
include REXML
|
11
8
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
9
|
+
attr_reader :text, :tokens
|
10
|
+
|
11
|
+
def initialize(text, method)
|
12
|
+
|
13
|
+
if method === :takipi
|
14
|
+
@text = load_lemated_text(text)
|
15
|
+
elsif method === :morfeusz
|
16
|
+
@text = lematize_text(text)
|
17
|
+
else
|
18
|
+
@text = text
|
19
|
+
end
|
20
|
+
|
21
|
+
@pos = 0
|
22
|
+
@tokens = flatten_text(@text)
|
19
23
|
end
|
20
24
|
|
21
|
-
|
22
|
-
|
25
|
+
def next(type)
|
26
|
+
@pos+=1
|
23
27
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
@pos+= 1
|
30
|
-
end
|
28
|
+
case type
|
29
|
+
when :word
|
30
|
+
while @pos < @tokens.size and !@tokens[@pos].word?
|
31
|
+
@pos+= 1
|
32
|
+
end
|
31
33
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
34
|
+
when :interp
|
35
|
+
while @pos < @tokens.size and !@tokens[@pos].interp?
|
36
|
+
@pos+= 1
|
37
|
+
end
|
38
|
+
|
39
|
+
when :number
|
40
|
+
while @pos < @tokens.size and !@tokens[@pos].number?
|
41
|
+
@pos+= 1
|
42
|
+
end
|
40
43
|
end
|
41
|
-
|
42
44
|
end
|
43
|
-
end
|
44
45
|
|
45
|
-
|
46
|
-
|
47
|
-
|
46
|
+
def current
|
47
|
+
|
48
|
+
if @pos == @tokens.size
|
48
49
|
nil
|
49
|
-
|
50
|
+
else
|
50
51
|
@tokens[@pos]
|
51
|
-
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
def index
|
56
|
-
@pos
|
57
|
-
end
|
58
|
-
|
59
|
-
def end?
|
60
|
-
@pos == tokens.size
|
61
|
-
end
|
62
|
-
|
63
|
-
|
64
|
-
private
|
65
|
-
|
66
|
-
def flatten_text(text)
|
67
|
-
flattened = []
|
68
|
-
text.each { |s| s.tokens.each {|t| flattened.push t } }
|
69
|
-
flattened
|
70
|
-
end
|
71
|
-
|
72
|
-
def load_lemated_text(text_file)
|
73
|
-
|
74
|
-
t1 = Thread.new do
|
75
|
-
`takipi -i #{text_file} -o output.xml -it TXT`
|
76
|
-
end
|
77
|
-
t1.join
|
52
|
+
end
|
78
53
|
|
54
|
+
end
|
79
55
|
|
56
|
+
def index
|
57
|
+
@pos
|
58
|
+
end
|
80
59
|
|
81
|
-
|
60
|
+
def end?
|
61
|
+
@pos == tokens.size
|
62
|
+
end
|
63
|
+
|
82
64
|
|
83
|
-
|
84
|
-
File.open("output.xml") do |f|
|
85
|
-
doc = Document.new(f)
|
65
|
+
private
|
86
66
|
|
87
|
-
|
88
|
-
|
89
|
-
|
67
|
+
def flatten_text(text)
|
68
|
+
flattened = []
|
69
|
+
text.each { |s| s.tokens.each {|t| flattened.push t } }
|
70
|
+
flattened
|
71
|
+
end
|
90
72
|
|
91
|
-
|
92
|
-
|
93
|
-
|
73
|
+
#Tok
|
74
|
+
|
75
|
+
def load_lemated_text(text)
|
94
76
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
77
|
+
t1 = Thread.new do
|
78
|
+
`echo #{text} | takipi -i -o output.xml -it TXT`
|
79
|
+
end
|
80
|
+
t1.join
|
81
|
+
|
82
|
+
text = []
|
83
|
+
File.open("output.xml") do |f|
|
84
|
+
doc = Document.new(f)
|
85
|
+
|
86
|
+
doc.elements.each("*/chunkList/chunk") do |chunk|
|
87
|
+
sentence = Sentence.new
|
88
|
+
tokens = []
|
89
|
+
|
90
|
+
chunk.elements.each("tok") do |tok|
|
91
|
+
word = tok.elements[1].text
|
92
|
+
lemat, inflect = ""
|
93
|
+
|
94
|
+
tok.elements.each("lex") do |lex|
|
95
|
+
if lex.has_attributes?
|
96
|
+
lemat = lex.elements[1].text
|
97
|
+
inflect = lex.elements[2].text
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
tokens << Word.new(word,lemat,inflect)
|
100
102
|
end
|
101
|
-
|
102
|
-
tokens << Word.new(word,lemat,inflect)
|
103
|
-
end
|
104
103
|
|
105
|
-
|
106
|
-
|
104
|
+
sentence << tokens
|
105
|
+
text << sentence
|
106
|
+
end
|
107
107
|
end
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
108
|
+
text
|
109
|
+
end
|
110
|
+
|
111
|
+
def lematize_text(text)
|
112
|
+
temp_text = []
|
113
|
+
text.split(/\.|!|\?/).each do |s|
|
114
|
+
sentence = Sentence.new
|
115
|
+
sentence << s.split(" ").collect{ |t|
|
116
|
+
if word = Morfeusz::Lexeme.find(t)
|
117
|
+
if word[0]
|
118
|
+
Word.new(t,word[0].base_form,"")
|
119
|
+
else
|
120
|
+
Word.new(t,"","")
|
121
|
+
end
|
122
|
+
else
|
121
123
|
Word.new(t,"","")
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
temp_text.push sentence
|
124
|
+
end
|
125
|
+
}
|
126
|
+
temp_text.push sentence
|
127
|
+
end
|
128
|
+
temp_text
|
128
129
|
end
|
129
|
-
temp_text
|
130
|
-
end
|
131
130
|
|
132
131
|
|
133
132
|
|
134
133
|
|
135
|
-
end
|
134
|
+
end
|
136
135
|
|
137
136
|
end
|
data/lib/word.rb
CHANGED
@@ -2,20 +2,20 @@ require 'inflectable'
|
|
2
2
|
require 'meaningable'
|
3
3
|
|
4
4
|
module NLP
|
5
|
-
class Word < Token
|
6
|
-
|
7
|
-
|
5
|
+
class Word < Token
|
6
|
+
include Inflectable
|
7
|
+
include Meaningable
|
8
8
|
|
9
9
|
attr_reader :lemat, :orth
|
10
|
-
|
11
|
-
|
10
|
+
|
11
|
+
def initialize(word, lemat, tags)
|
12
12
|
super(word,tags)
|
13
13
|
@lemat = lemat
|
14
|
-
|
14
|
+
end
|
15
15
|
|
16
16
|
def inflection
|
17
17
|
@tags
|
18
18
|
end
|
19
19
|
|
20
|
-
end
|
20
|
+
end
|
21
21
|
end
|
data/test/word_test.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require '../lib/word.rb'
|
3
|
+
class WordTest < Test::Unit::TestCase
|
4
|
+
def setup
|
5
|
+
@word_kota = NLP::Word.new('kota','kot','subst:sg:gen.acc:m2')
|
6
|
+
@word_siebie = NLP::Word.new('siebie','się','siebie:gen.acc')
|
7
|
+
end
|
8
|
+
|
9
|
+
def test_word_lematization
|
10
|
+
assert_equal 'kot', @word_kota.lemat
|
11
|
+
assert_equal 'się', @word_siebie.lemat
|
12
|
+
end
|
13
|
+
|
14
|
+
def test_word_orth
|
15
|
+
assert_equal 'kota', @word_kota.orth
|
16
|
+
assert_equal 'siebie', @word_siebie.orth
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_recognizing_part_of_speech
|
20
|
+
assert @word_kota.rzeczownik?
|
21
|
+
assert @word_siebie.zaimek?
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_recognizing_inflection
|
25
|
+
assert @word_kota.liczba_pojedyncza?
|
26
|
+
assert @word_kota.dopelniacz?
|
27
|
+
assert @word_kota.biernik?
|
28
|
+
assert @word_kota.meski_zwierzecy?
|
29
|
+
|
30
|
+
assert_equal false, @word_kota.liczba_mnoga?
|
31
|
+
assert_equal false, @word_kota.mianownik?
|
32
|
+
|
33
|
+
assert @word_siebie.biernik?
|
34
|
+
assert @word_siebie.dopelniacz?
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_inflection_string
|
38
|
+
assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 19
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-09-
|
18
|
+
date: 2010-09-06 00:00:00 +02:00
|
19
19
|
default_executable:
|
20
20
|
dependencies: []
|
21
21
|
|
@@ -51,6 +51,7 @@ files:
|
|
51
51
|
- README.rdoc
|
52
52
|
- test/helper.rb
|
53
53
|
- test/test_nlp.rb
|
54
|
+
- test/word_test.rb
|
54
55
|
has_rdoc: true
|
55
56
|
homepage: http://github.com/knife/nlp
|
56
57
|
licenses: []
|
@@ -88,3 +89,4 @@ summary: Linguistics tools for processing polish language.
|
|
88
89
|
test_files:
|
89
90
|
- test/helper.rb
|
90
91
|
- test/test_nlp.rb
|
92
|
+
- test/word_test.rb
|