nlp 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/analyzer.rb CHANGED
@@ -1,12 +1,10 @@
1
1
  require 'dictionary'
2
- require 'morfeusz'
2
+ #require 'morfeusz'
3
3
  require 'token'
4
4
  require 'word'
5
5
  require 'emoticon'
6
6
  require 'sentence'
7
7
  require "token_scanner.rb"
8
- require "inflectable"
9
- require "meaningable"
10
8
 
11
9
  $KODE = "UTF8"
12
10
 
@@ -15,7 +13,6 @@ module NLP
15
13
  class Analyzer
16
14
 
17
15
  include REXML
18
- #Lexeme = Apohllo::Morfeusz::Lexeme
19
16
 
20
17
  def initialize( category_file, restore = true )
21
18
  state_file = File.expand_path(Dictionary::CACHE_DIR)
@@ -46,7 +43,6 @@ module NLP
46
43
  unless categories.nil?
47
44
  categories.each do |category|
48
45
 
49
- puts "#{word} : #{category.name}"
50
46
  results[:scores][category] = results[:scores][category] + 1
51
47
  end
52
48
 
data/lib/dictionary.rb CHANGED
@@ -59,7 +59,6 @@ module NLP
59
59
  @tree.insert( word, category )
60
60
  end
61
61
  rescue
62
- puts "Error for line: #{line}"
63
62
  raise
64
63
  end
65
64
  end
data/lib/emoticon.rb CHANGED
@@ -1,5 +1,6 @@
1
+ require 'meaningable'
1
2
  module NLP
2
- class Emoticon < Token
3
+ class Emoticon < Token
3
4
  include Meaningable
4
5
 
5
6
  def initialize(tokens,tags)
@@ -8,6 +9,6 @@ class Emoticon < Token
8
9
  end
9
10
 
10
11
 
11
- end
12
+ end
12
13
  end
13
14
 
data/lib/inflectable.rb CHANGED
@@ -2,8 +2,8 @@ module Inflectable
2
2
 
3
3
  GRAM_CAT = {
4
4
  #rzeczownik
5
- [:subst, :depr] => 'rzeczownik',
6
5
  :adj => 'przymiotnik',
6
+ [:subst,:depr] => 'rzeczownik',
7
7
  :adv => 'przyslowek',
8
8
  :num => 'liczebnik',
9
9
  [:pron,:siebie] => 'zaimek',
@@ -22,15 +22,15 @@ module Inflectable
22
22
  :voc => 'wolacz',
23
23
 
24
24
  #Rodzaje
25
- :m1 => 'męski_osobowy',
26
- :m2 => 'męski_zwierzęcy',
27
- :m3 => 'męski_rzeczowy',
28
- :f => 'żeński',
29
- :n1 => 'nijaki zbiorowy',
30
- :n2 => 'nijaki zwykły',
31
- :p1 => 'przymnogi osobowy',
32
- :p2 => 'przymnogi zwykły',
33
- :p3 => 'przymnogi opisowy',
25
+ :m1 => 'meski_osobowy',
26
+ :m2 => 'meski_zwierzecy',
27
+ :m3 => 'meski_rzeczowy',
28
+ :f => 'zenski',
29
+ :n1 => 'nijaki_zbiorowy',
30
+ :n2 => 'nijaki zwykly',
31
+ :p1 => 'przymnogi_osobowy',
32
+ :p2 => 'przymnogi_zwykly',
33
+ :p3 => 'przymnogi_opisowy',
34
34
 
35
35
  #Osoby
36
36
  :pri => "pierwsza_osoba",
@@ -44,16 +44,18 @@ module Inflectable
44
44
  }
45
45
 
46
46
  GRAM_CAT.each do |key,value|
47
- if key.kind_of? Array
48
- key = key.first
49
- else
50
- define_method(value+"?"){
51
- inflection.split(":").any?{|e| e.include? key.to_s[1..-1]}
52
- }
53
- end
47
+
48
+ define_method(value+"?"){
49
+ inflection.split(":").any?{|e|
50
+ if key.is_a? Array
51
+ key.any?{|k| e.include? k.to_s}
52
+ else
53
+ e.include? key.to_s
54
+ end
55
+ }
56
+ }
54
57
  end
55
58
 
56
59
 
57
-
58
60
 
59
61
  end
data/lib/sentence.rb CHANGED
@@ -1,14 +1,12 @@
1
1
  module NLP
2
- class Sentence
3
- attr_reader :tokens
4
- def initialize()
5
- @tokens = []
6
- end
2
+ class Sentence
3
+ attr_reader :tokens
4
+ def initialize()
5
+ @tokens = []
6
+ end
7
7
 
8
- def << tokens
9
- @tokens.concat tokens
8
+ def << tokens
9
+ @tokens.concat tokens
10
+ end
10
11
  end
11
-
12
-
13
- end
14
12
  end
data/lib/token.rb CHANGED
@@ -1,35 +1,34 @@
1
- require 'inflectable'
1
+
2
2
  module NLP
3
- class Token
4
- attr_reader :orth
5
- attr_reader :tags
6
-
7
-
8
- def initialize(orth,tags)
9
- @orth = orth
10
- @tags = tags
11
- end
3
+ class Token
4
+ attr_reader :orth
5
+ attr_reader :tags
12
6
 
13
- def interp?
14
- @tags.eql? "interp"
15
- end
7
+
8
+ def initialize(orth,tags)
9
+ @orth = orth
10
+ @tags = tags
11
+ end
16
12
 
17
- def word?
18
- not interp? and not number?
19
- end
13
+ def interp?
14
+ @tags.eql? "interp"
15
+ end
20
16
 
21
- def number?
22
- @tags.include?("tnum")
23
- end
17
+ def word?
18
+ not interp? and not number?
19
+ end
24
20
 
25
- def integer?
26
- @tags.include?("tnum:integer")
27
- end
21
+ def number?
22
+ @tags.include?("tnum")
23
+ end
28
24
 
29
- def float?
30
- @tags.include?("tnum:frac")
31
- end
25
+ def integer?
26
+ @tags.include?("tnum:integer")
27
+ end
32
28
 
29
+ def float?
30
+ @tags.include?("tnum:frac")
31
+ end
33
32
 
34
- end
33
+ end
35
34
  end
data/lib/token_scanner.rb CHANGED
@@ -1,137 +1,136 @@
1
1
 
2
2
  require 'rexml/document'
3
3
  require 'soap/rpc/driver'
4
- module NLP
5
- class TokenScanner
6
- include REXML
7
- attr_reader :text, :tokens
8
4
 
9
- def initialize(text, method)
10
- @pos = 0
5
+ module NLP
6
+ class TokenScanner
7
+ include REXML
11
8
 
12
- if method === :file
13
- puts "laduje tekst"
14
- @text = load_lemated_text(text)
15
- elsif method === :text
16
- @text = lematize_text(text)
17
- else
18
- @text = text
9
+ attr_reader :text, :tokens
10
+
11
+ def initialize(text, method)
12
+
13
+ if method === :takipi
14
+ @text = load_lemated_text(text)
15
+ elsif method === :morfeusz
16
+ @text = lematize_text(text)
17
+ else
18
+ @text = text
19
+ end
20
+
21
+ @pos = 0
22
+ @tokens = flatten_text(@text)
19
23
  end
20
24
 
21
- @tokens = flatten_text(@text)
22
- end
25
+ def next(type)
26
+ @pos+=1
23
27
 
24
- def next(type)
25
- @pos+=1
26
- case type
27
- when :word
28
- while @pos < @tokens.size and !@tokens[@pos].word?
29
- @pos+= 1
30
- end
28
+ case type
29
+ when :word
30
+ while @pos < @tokens.size and !@tokens[@pos].word?
31
+ @pos+= 1
32
+ end
31
33
 
32
- when :interp
33
- while @pos < @tokens.size and !@tokens[@pos].interp?
34
- @pos+= 1
35
- end
36
-
37
- when :number
38
- while @pos < @tokens.size and !@tokens[@pos].number?
39
- @pos+= 1
34
+ when :interp
35
+ while @pos < @tokens.size and !@tokens[@pos].interp?
36
+ @pos+= 1
37
+ end
38
+
39
+ when :number
40
+ while @pos < @tokens.size and !@tokens[@pos].number?
41
+ @pos+= 1
42
+ end
40
43
  end
41
-
42
44
  end
43
- end
44
45
 
45
- def current
46
-
47
- if @pos == @tokens.size
46
+ def current
47
+
48
+ if @pos == @tokens.size
48
49
  nil
49
- else
50
+ else
50
51
  @tokens[@pos]
51
- end
52
-
53
- end
54
-
55
- def index
56
- @pos
57
- end
58
-
59
- def end?
60
- @pos == tokens.size
61
- end
62
-
63
-
64
- private
65
-
66
- def flatten_text(text)
67
- flattened = []
68
- text.each { |s| s.tokens.each {|t| flattened.push t } }
69
- flattened
70
- end
71
-
72
- def load_lemated_text(text_file)
73
-
74
- t1 = Thread.new do
75
- `takipi -i #{text_file} -o output.xml -it TXT`
76
- end
77
- t1.join
52
+ end
78
53
 
54
+ end
79
55
 
56
+ def index
57
+ @pos
58
+ end
80
59
 
81
-
60
+ def end?
61
+ @pos == tokens.size
62
+ end
63
+
82
64
 
83
- text = []
84
- File.open("output.xml") do |f|
85
- doc = Document.new(f)
65
+ private
86
66
 
87
- doc.elements.each("*/chunkList/chunk") do |chunk|
88
- sentence = Sentence.new
89
- tokens = []
67
+ def flatten_text(text)
68
+ flattened = []
69
+ text.each { |s| s.tokens.each {|t| flattened.push t } }
70
+ flattened
71
+ end
90
72
 
91
- chunk.elements.each("tok") do |tok|
92
- word = tok.elements[1].text
93
- lemat, inflect = ""
73
+ #Tok
74
+
75
+ def load_lemated_text(text)
94
76
 
95
- tok.elements.each("lex") do |lex|
96
- if lex.has_attributes?
97
- lemat = lex.elements[1].text
98
- inflect = lex.elements[2].text
99
- end
77
+ t1 = Thread.new do
78
+ `echo #{text} | takipi -i -o output.xml -it TXT`
79
+ end
80
+ t1.join
81
+
82
+ text = []
83
+ File.open("output.xml") do |f|
84
+ doc = Document.new(f)
85
+
86
+ doc.elements.each("*/chunkList/chunk") do |chunk|
87
+ sentence = Sentence.new
88
+ tokens = []
89
+
90
+ chunk.elements.each("tok") do |tok|
91
+ word = tok.elements[1].text
92
+ lemat, inflect = ""
93
+
94
+ tok.elements.each("lex") do |lex|
95
+ if lex.has_attributes?
96
+ lemat = lex.elements[1].text
97
+ inflect = lex.elements[2].text
98
+ end
99
+ end
100
+
101
+ tokens << Word.new(word,lemat,inflect)
100
102
  end
101
-
102
- tokens << Word.new(word,lemat,inflect)
103
- end
104
103
 
105
- sentence << tokens
106
- text << sentence
104
+ sentence << tokens
105
+ text << sentence
106
+ end
107
107
  end
108
- end
109
- text
110
- end
111
-
112
- def lematize_text(text)
113
- temp_text = []
114
- text.split(/\.|!|\?/).each do |s|
115
- sentence = Sentence.new
116
- sentence << s.split(" ").collect{ |t|
117
- if word = Morfeusz::Lexeme.find(t)
118
- if word[0]
119
- Word.new(t,word[0].base_form,"")
120
- else
108
+ text
109
+ end
110
+
111
+ def lematize_text(text)
112
+ temp_text = []
113
+ text.split(/\.|!|\?/).each do |s|
114
+ sentence = Sentence.new
115
+ sentence << s.split(" ").collect{ |t|
116
+ if word = Morfeusz::Lexeme.find(t)
117
+ if word[0]
118
+ Word.new(t,word[0].base_form,"")
119
+ else
120
+ Word.new(t,"","")
121
+ end
122
+ else
121
123
  Word.new(t,"","")
122
- end
123
- else
124
- Word.new(t,"","")
125
- end
126
- }
127
- temp_text.push sentence
124
+ end
125
+ }
126
+ temp_text.push sentence
127
+ end
128
+ temp_text
128
129
  end
129
- temp_text
130
- end
131
130
 
132
131
 
133
132
 
134
133
 
135
- end
134
+ end
136
135
 
137
136
  end
data/lib/word.rb CHANGED
@@ -2,20 +2,20 @@ require 'inflectable'
2
2
  require 'meaningable'
3
3
 
4
4
  module NLP
5
- class Word < Token
6
- include Inflectable
7
- include Meaningable
5
+ class Word < Token
6
+ include Inflectable
7
+ include Meaningable
8
8
 
9
9
  attr_reader :lemat, :orth
10
-
11
- def initialize(word, lemat, tags)
10
+
11
+ def initialize(word, lemat, tags)
12
12
  super(word,tags)
13
13
  @lemat = lemat
14
- end
14
+ end
15
15
 
16
16
  def inflection
17
17
  @tags
18
18
  end
19
19
 
20
- end
20
+ end
21
21
  end
data/test/word_test.rb ADDED
@@ -0,0 +1,42 @@
1
+ require 'helper'
2
+ require '../lib/word.rb'
3
+ class WordTest < Test::Unit::TestCase
4
+ def setup
5
+ @word_kota = NLP::Word.new('kota','kot','subst:sg:gen.acc:m2')
6
+ @word_siebie = NLP::Word.new('siebie','się','siebie:gen.acc')
7
+ end
8
+
9
+ def test_word_lematization
10
+ assert_equal 'kot', @word_kota.lemat
11
+ assert_equal 'się', @word_siebie.lemat
12
+ end
13
+
14
+ def test_word_orth
15
+ assert_equal 'kota', @word_kota.orth
16
+ assert_equal 'siebie', @word_siebie.orth
17
+ end
18
+
19
+ def test_recognizing_part_of_speech
20
+ assert @word_kota.rzeczownik?
21
+ assert @word_siebie.zaimek?
22
+ end
23
+
24
+ def test_recognizing_inflection
25
+ assert @word_kota.liczba_pojedyncza?
26
+ assert @word_kota.dopelniacz?
27
+ assert @word_kota.biernik?
28
+ assert @word_kota.meski_zwierzecy?
29
+
30
+ assert_equal false, @word_kota.liczba_mnoga?
31
+ assert_equal false, @word_kota.mianownik?
32
+
33
+ assert @word_siebie.biernik?
34
+ assert @word_siebie.dopelniacz?
35
+ end
36
+
37
+ def test_inflection_string
38
+ assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
39
+ end
40
+
41
+
42
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nlp
3
3
  version: !ruby/object:Gem::Version
4
- hash: 21
4
+ hash: 19
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 2
9
- - 1
10
- version: 0.2.1
9
+ - 2
10
+ version: 0.2.2
11
11
  platform: ruby
12
12
  authors:
13
13
  - knife
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-09-04 00:00:00 +02:00
18
+ date: 2010-09-06 00:00:00 +02:00
19
19
  default_executable:
20
20
  dependencies: []
21
21
 
@@ -51,6 +51,7 @@ files:
51
51
  - README.rdoc
52
52
  - test/helper.rb
53
53
  - test/test_nlp.rb
54
+ - test/word_test.rb
54
55
  has_rdoc: true
55
56
  homepage: http://github.com/knife/nlp
56
57
  licenses: []
@@ -88,3 +89,4 @@ summary: Linguistics tools for processing polish language.
88
89
  test_files:
89
90
  - test/helper.rb
90
91
  - test/test_nlp.rb
92
+ - test/word_test.rb