nlp 0.2.1 → 0.2.2

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
data/lib/analyzer.rb CHANGED
@@ -1,12 +1,10 @@
  require 'dictionary'
- require 'morfeusz'
+ #require 'morfeusz'
  require 'token'
  require 'word'
  require 'emoticon'
  require 'sentence'
  require "token_scanner.rb"
- require "inflectable"
- require "meaningable"
 
  $KODE = "UTF8"
 
@@ -15,7 +13,6 @@ module NLP
  class Analyzer
 
  include REXML
- #Lexeme = Apohllo::Morfeusz::Lexeme
 
  def initialize( category_file, restore = true )
  state_file = File.expand_path(Dictionary::CACHE_DIR)
@@ -46,7 +43,6 @@ module NLP
  unless categories.nil?
  categories.each do |category|
 
- puts "#{word} : #{category.name}"
  results[:scores][category] = results[:scores][category] + 1
  end
 
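The last hunk above is the category-scoring loop: for every category a word belongs to, Analyzer bumps results[:scores][category], now without the debug puts. A minimal sketch of that tally in isolation (the Category struct, the lookup hash, and the sample words are hypothetical stand-ins, not the gem's dictionary):

    # Hypothetical stand-ins: only the tallying line mirrors the hunk above.
    Category = Struct.new(:name)
    joy   = Category.new('joy')
    anger = Category.new('anger')

    lookup = {
      'radosc' => [joy],          # hypothetical word -> categories mapping
      'gniew'  => [anger, joy]
    }

    results = { :scores => Hash.new(0) }

    %w[radosc gniew gniew].each do |word|
      categories = lookup[word]
      next if categories.nil?
      categories.each do |category|
        # same update as in Analyzer, minus the removed debug output
        results[:scores][category] = results[:scores][category] + 1
      end
    end

    results[:scores].each { |category, score| puts "#{category.name}: #{score}" }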
data/lib/dictionary.rb CHANGED
@@ -59,7 +59,6 @@ module NLP
  @tree.insert( word, category )
  end
  rescue
- puts "Error for line: #{line}"
  raise
  end
  end
data/lib/emoticon.rb CHANGED
@@ -1,5 +1,6 @@
+ require 'meaningable'
  module NLP
- class Emoticon < Token
+ class Emoticon < Token
  include Meaningable
 
  def initialize(tokens,tags)
@@ -8,6 +9,6 @@ class Emoticon < Token
  end
 
 
- end
+ end
  end
 
data/lib/inflectable.rb CHANGED
@@ -2,8 +2,8 @@ module Inflectable
 
  GRAM_CAT = {
  #rzeczownik
- [:subst, :depr] => 'rzeczownik',
  :adj => 'przymiotnik',
+ [:subst,:depr] => 'rzeczownik',
  :adv => 'przyslowek',
  :num => 'liczebnik',
  [:pron,:siebie] => 'zaimek',
@@ -22,15 +22,15 @@ module Inflectable
  :voc => 'wolacz',
 
  #Rodzaje
- :m1 => 'męski_osobowy',
- :m2 => 'męski_zwierzęcy',
- :m3 => 'męski_rzeczowy',
- :f => 'żeński',
- :n1 => 'nijaki zbiorowy',
- :n2 => 'nijaki zwykły',
- :p1 => 'przymnogi osobowy',
- :p2 => 'przymnogi zwykły',
- :p3 => 'przymnogi opisowy',
+ :m1 => 'meski_osobowy',
+ :m2 => 'meski_zwierzecy',
+ :m3 => 'meski_rzeczowy',
+ :f => 'zenski',
+ :n1 => 'nijaki_zbiorowy',
+ :n2 => 'nijaki zwykly',
+ :p1 => 'przymnogi_osobowy',
+ :p2 => 'przymnogi_zwykly',
+ :p3 => 'przymnogi_opisowy',
 
  #Osoby
  :pri => "pierwsza_osoba",
@@ -44,16 +44,18 @@ module Inflectable
  }
 
  GRAM_CAT.each do |key,value|
- if key.kind_of? Array
- key = key.first
- else
- define_method(value+"?"){
- inflection.split(":").any?{|e| e.include? key.to_s[1..-1]}
- }
- end
+
+ define_method(value+"?"){
+ inflection.split(":").any?{|e|
+ if key.is_a? Array
+ key.any?{|k| e.include? k.to_s}
+ else
+ e.include? key.to_s
+ end
+ }
+ }
  end
 
 
-
 
  end
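The last hunk above replaces the old per-key branching with a single define_method per GRAM_CAT entry: when the key is an Array, the generated predicate matches any of the listed tags, so rzeczownik? now covers both :subst and :depr. A reduced, self-contained sketch of that pattern (MiniInflectable and DemoWord are hypothetical names; the tag strings come from the diff):

    # Reduced sketch of the rewritten Inflectable metaprogramming.
    module MiniInflectable
      GRAM_CAT = {
        [:subst, :depr] => 'rzeczownik',
        :adj            => 'przymiotnik',
        :m2             => 'meski_zwierzecy'
      }

      GRAM_CAT.each do |key, value|
        define_method(value + "?") {
          inflection.split(":").any? { |e|
            if key.is_a? Array
              key.any? { |k| e.include? k.to_s }   # array key: any listed tag matches
            else
              e.include? key.to_s                  # single key: that tag must appear
            end
          }
        }
      end
    end

    # Hypothetical carrier of an inflection string, standing in for NLP::Word.
    class DemoWord
      include MiniInflectable
      attr_reader :inflection
      def initialize(inflection)
        @inflection = inflection
      end
    end

    w = DemoWord.new('subst:sg:gen.acc:m2')
    p w.rzeczownik?       # => true, via :subst from the array key
    p w.przymiotnik?      # => false
    p w.meski_zwierzecy?  # => true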
data/lib/sentence.rb CHANGED
@@ -1,14 +1,12 @@
  module NLP
- class Sentence
- attr_reader :tokens
- def initialize()
- @tokens = []
- end
+ class Sentence
+ attr_reader :tokens
+ def initialize()
+ @tokens = []
+ end
 
- def << tokens
- @tokens.concat tokens
+ def << tokens
+ @tokens.concat tokens
+ end
  end
-
-
- end
  end
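After the cleanup Sentence is a plain container whose << concatenates an array of tokens onto @tokens. A tiny usage sketch (it assumes the gem's lib/ directory is on the load path; the strings stand in for Token/Word objects):

    require 'sentence'            # assumes lib/ is on $LOAD_PATH

    s = NLP::Sentence.new
    s << ["Ala", "ma", "kota"]    # << concatenates an array onto @tokens
    s << ["."]
    p s.tokens                    # => ["Ala", "ma", "kota", "."]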
data/lib/token.rb CHANGED
@@ -1,35 +1,34 @@
- require 'inflectable'
+
  module NLP
- class Token
- attr_reader :orth
- attr_reader :tags
-
-
- def initialize(orth,tags)
- @orth = orth
- @tags = tags
- end
+ class Token
+ attr_reader :orth
+ attr_reader :tags
 
- def interp?
- @tags.eql? "interp"
- end
+
+ def initialize(orth,tags)
+ @orth = orth
+ @tags = tags
+ end
 
- def word?
- not interp? and not number?
- end
+ def interp?
+ @tags.eql? "interp"
+ end
 
- def number?
- @tags.include?("tnum")
- end
+ def word?
+ not interp? and not number?
+ end
 
- def integer?
- @tags.include?("tnum:integer")
- end
+ def number?
+ @tags.include?("tnum")
+ end
 
- def float?
- @tags.include?("tnum:frac")
- end
+ def integer?
+ @tags.include?("tnum:integer")
+ end
 
+ def float?
+ @tags.include?("tnum:frac")
+ end
 
- end
+ end
  end
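Token keeps only the orth/tags readers plus the tag predicates interp?, word?, number?, integer? and float?. A short sketch of how those predicates read tag strings (load-path assumption as above; the sample tags follow the formats the predicates check for):

    require 'token'               # assumes lib/ is on $LOAD_PATH

    dot    = NLP::Token.new(".", "interp")
    number = NLP::Token.new("42", "tnum:integer")
    word   = NLP::Token.new("kot", "subst:sg:nom:m2")

    p dot.interp?      # => true
    p dot.word?        # => false ("interp" tokens are not words)
    p number.number?   # => true  (tags contain "tnum")
    p number.integer?  # => true
    p number.float?    # => false ("tnum:frac" would make this true)
    p word.word?       # => true  (neither interp nor a number)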
data/lib/token_scanner.rb CHANGED
@@ -1,137 +1,136 @@
 
  require 'rexml/document'
  require 'soap/rpc/driver'
- module NLP
- class TokenScanner
- include REXML
- attr_reader :text, :tokens
 
- def initialize(text, method)
- @pos = 0
+ module NLP
+ class TokenScanner
+ include REXML
 
- if method === :file
- puts "laduje tekst"
- @text = load_lemated_text(text)
- elsif method === :text
- @text = lematize_text(text)
- else
- @text = text
+ attr_reader :text, :tokens
+
+ def initialize(text, method)
+
+ if method === :takipi
+ @text = load_lemated_text(text)
+ elsif method === :morfeusz
+ @text = lematize_text(text)
+ else
+ @text = text
+ end
+
+ @pos = 0
+ @tokens = flatten_text(@text)
  end
 
- @tokens = flatten_text(@text)
- end
+ def next(type)
+ @pos+=1
 
- def next(type)
- @pos+=1
- case type
- when :word
- while @pos < @tokens.size and !@tokens[@pos].word?
- @pos+= 1
- end
+ case type
+ when :word
+ while @pos < @tokens.size and !@tokens[@pos].word?
+ @pos+= 1
+ end
 
- when :interp
- while @pos < @tokens.size and !@tokens[@pos].interp?
- @pos+= 1
- end
-
- when :number
- while @pos < @tokens.size and !@tokens[@pos].number?
- @pos+= 1
+ when :interp
+ while @pos < @tokens.size and !@tokens[@pos].interp?
+ @pos+= 1
+ end
+
+ when :number
+ while @pos < @tokens.size and !@tokens[@pos].number?
+ @pos+= 1
+ end
  end
-
  end
- end
 
- def current
-
- if @pos == @tokens.size
+ def current
+
+ if @pos == @tokens.size
  nil
- else
+ else
  @tokens[@pos]
- end
-
- end
-
- def index
- @pos
- end
-
- def end?
- @pos == tokens.size
- end
-
-
- private
-
- def flatten_text(text)
- flattened = []
- text.each { |s| s.tokens.each {|t| flattened.push t } }
- flattened
- end
-
- def load_lemated_text(text_file)
-
- t1 = Thread.new do
- `takipi -i #{text_file} -o output.xml -it TXT`
- end
- t1.join
+ end
 
+ end
 
+ def index
+ @pos
+ end
 
-
+ def end?
+ @pos == tokens.size
+ end
+
 
- text = []
- File.open("output.xml") do |f|
- doc = Document.new(f)
+ private
 
- doc.elements.each("*/chunkList/chunk") do |chunk|
- sentence = Sentence.new
- tokens = []
+ def flatten_text(text)
+ flattened = []
+ text.each { |s| s.tokens.each {|t| flattened.push t } }
+ flattened
+ end
 
- chunk.elements.each("tok") do |tok|
- word = tok.elements[1].text
- lemat, inflect = ""
+ #Tok
+
+ def load_lemated_text(text)
 
- tok.elements.each("lex") do |lex|
- if lex.has_attributes?
- lemat = lex.elements[1].text
- inflect = lex.elements[2].text
- end
+ t1 = Thread.new do
+ `echo #{text} | takipi -i -o output.xml -it TXT`
+ end
+ t1.join
+
+ text = []
+ File.open("output.xml") do |f|
+ doc = Document.new(f)
+
+ doc.elements.each("*/chunkList/chunk") do |chunk|
+ sentence = Sentence.new
+ tokens = []
+
+ chunk.elements.each("tok") do |tok|
+ word = tok.elements[1].text
+ lemat, inflect = ""
+
+ tok.elements.each("lex") do |lex|
+ if lex.has_attributes?
+ lemat = lex.elements[1].text
+ inflect = lex.elements[2].text
+ end
+ end
+
+ tokens << Word.new(word,lemat,inflect)
  end
-
- tokens << Word.new(word,lemat,inflect)
- end
 
- sentence << tokens
- text << sentence
+ sentence << tokens
+ text << sentence
+ end
  end
- end
- text
- end
-
- def lematize_text(text)
- temp_text = []
- text.split(/\.|!|\?/).each do |s|
- sentence = Sentence.new
- sentence << s.split(" ").collect{ |t|
- if word = Morfeusz::Lexeme.find(t)
- if word[0]
- Word.new(t,word[0].base_form,"")
- else
+ text
+ end
+
+ def lematize_text(text)
+ temp_text = []
+ text.split(/\.|!|\?/).each do |s|
+ sentence = Sentence.new
+ sentence << s.split(" ").collect{ |t|
+ if word = Morfeusz::Lexeme.find(t)
+ if word[0]
+ Word.new(t,word[0].base_form,"")
+ else
+ Word.new(t,"","")
+ end
+ else
  Word.new(t,"","")
- end
- else
- Word.new(t,"","")
- end
- }
- temp_text.push sentence
+ end
+ }
+ temp_text.push sentence
+ end
+ temp_text
  end
- temp_text
- end
 
 
 
 
- end
+ end
 
  end
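TokenScanner now picks its lemmatisation path from the method symbol: :takipi shells out to the external takipi tagger, :morfeusz goes through Morfeusz::Lexeme, and anything else is treated as already-lemmatised sentences. A hedged usage sketch for the pass-through case (it assumes lib/ is on the load path; :plain is an arbitrary symbol, and the :takipi/:morfeusz calls in the trailing comment additionally require the external tagger or the Morfeusz bindings):

    # Assumes lib/ is on $LOAD_PATH and a Ruby where soap/rpc/driver is available.
    require 'token'
    require 'word'
    require 'sentence'
    require 'token_scanner.rb'

    sentence = NLP::Sentence.new
    sentence << [NLP::Word.new('Ala', 'Ala', 'subst:sg:nom:f'),
                 NLP::Word.new('.',   '.',   'interp')]

    # Any method symbol other than :takipi/:morfeusz means "use the text as-is".
    scanner = NLP::TokenScanner.new([sentence], :plain)

    until scanner.end?
      token = scanner.current
      puts "#{scanner.index}: #{token.orth}" if token
      scanner.next(:word)       # jump to the next word-like token
    end

    # With the external tools installed, the other modes would be, e.g.:
    #   NLP::TokenScanner.new("Ala ma kota.", :takipi)     # pipes the text to takipi
    #   NLP::TokenScanner.new("Ala ma kota.", :morfeusz)   # looks up Morfeusz::Lexeme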
data/lib/word.rb CHANGED
@@ -2,20 +2,20 @@ require 'inflectable'
  require 'meaningable'
 
  module NLP
- class Word < Token
- include Inflectable
- include Meaningable
+ class Word < Token
+ include Inflectable
+ include Meaningable
 
  attr_reader :lemat, :orth
-
- def initialize(word, lemat, tags)
+
+ def initialize(word, lemat, tags)
  super(word,tags)
  @lemat = lemat
- end
+ end
 
  def inflection
  @tags
  end
 
- end
+ end
  end
data/test/word_test.rb ADDED
@@ -0,0 +1,42 @@
+ require 'helper'
+ require '../lib/word.rb'
+ class WordTest < Test::Unit::TestCase
+ def setup
+ @word_kota = NLP::Word.new('kota','kot','subst:sg:gen.acc:m2')
+ @word_siebie = NLP::Word.new('siebie','się','siebie:gen.acc')
+ end
+
+ def test_word_lematization
+ assert_equal 'kot', @word_kota.lemat
+ assert_equal 'się', @word_siebie.lemat
+ end
+
+ def test_word_orth
+ assert_equal 'kota', @word_kota.orth
+ assert_equal 'siebie', @word_siebie.orth
+ end
+
+ def test_recognizing_part_of_speech
+ assert @word_kota.rzeczownik?
+ assert @word_siebie.zaimek?
+ end
+
+ def test_recognizing_inflection
+ assert @word_kota.liczba_pojedyncza?
+ assert @word_kota.dopelniacz?
+ assert @word_kota.biernik?
+ assert @word_kota.meski_zwierzecy?
+
+ assert_equal false, @word_kota.liczba_mnoga?
+ assert_equal false, @word_kota.mianownik?
+
+ assert @word_siebie.biernik?
+ assert @word_siebie.dopelniacz?
+ end
+
+ def test_inflection_string
+ assert_equal @word_kota.inflection, 'subst:sg:gen.acc:m2'
+ end
+
+
+ end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: nlp
  version: !ruby/object:Gem::Version
- hash: 21
+ hash: 19
  prerelease: false
  segments:
  - 0
  - 2
- - 1
- version: 0.2.1
+ - 2
+ version: 0.2.2
  platform: ruby
  authors:
  - knife
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-09-04 00:00:00 +02:00
+ date: 2010-09-06 00:00:00 +02:00
  default_executable:
  dependencies: []
 
@@ -51,6 +51,7 @@ files:
  - README.rdoc
  - test/helper.rb
  - test/test_nlp.rb
+ - test/word_test.rb
  has_rdoc: true
  homepage: http://github.com/knife/nlp
  licenses: []
@@ -88,3 +89,4 @@ summary: Linguistics tools for processing polish language.
  test_files:
  - test/helper.rb
  - test/test_nlp.rb
+ - test/word_test.rb