nlp 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/analyzer.rb CHANGED
@@ -9,72 +9,40 @@ require 'sentence'
9
9
  require "token_scanner.rb"
10
10
  require "lemmatizer"
11
11
  require 'jcode'
12
+ require 'statistic'
12
13
  $KODE = "UTF8"
13
14
 
14
15
  module NLP
15
16
 
16
17
  class Analyzer
17
18
 
18
- CACHE_DIR = '~/'
19
-
20
- def initialize( category_file, restore = true )
21
- state_file = File.expand_path(Analyzer::CACHE_DIR)
22
- if restore
23
- @dictionary = Dictionary.restore(state_file)
24
- else
25
- @dictionary = Dictionary.new
26
- @dictionary.load_categories( category_file, :rid => true )
27
- @dictionary.store(state_file)
28
- end
29
-
19
+ def initialize(dict)
20
+ @dictionary = Dictionary.new(dict)
30
21
  end
31
-
32
-
33
- def analyze( scanner)
34
-
35
- results = {
36
- :word_count => 0,
37
- :word_total => 0,
38
- :scores => Hash.new { 0 },
39
- :words => []
40
- }
41
22
 
42
23
 
24
+ def analyze(scanner)
25
+
26
+ results = Statistic.new
43
27
 
44
- while token = scanner.current
28
+ while token = scanner.current
45
29
  word = token.lemat
46
30
 
47
- categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
31
+ categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
48
32
  unless categories.nil?
49
- categories.each do |category|
50
- puts "Znalazłem słowo #{word} : #{category}"
51
- results[:scores][category] = results[:scores][category] + 1
52
- results[:word_count] += 1
53
- results[:words].push word
54
- end
55
-
56
-
33
+ categories.each do |category|
34
+ puts "Znalazłem słowo #{word} : #{category}"
35
+ results.add(word,category)
36
+ end
57
37
  end
58
38
 
59
- results[:word_total] += 1
39
+ results.total_words += 1
60
40
  scanner.next(:word)
61
- end
62
-
63
- results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
64
- primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
65
- secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
66
- emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
67
-
41
+ end
68
42
 
69
- results[:classes] = {
70
- :primary => Float(primary_sum) / results[:word_count],
71
- :secondary => Float(secondary_sum) / results[:word_count],
72
- :emotions => Float(emotion_sum) / results[:word_count]
73
- }
74
-
75
43
  results
76
- end
77
44
 
45
+ end
78
46
  end
79
47
  end
80
48
 
data/lib/category.rb CHANGED
@@ -1,16 +1,16 @@
1
1
  module NLP
2
2
  class Category
3
3
  attr_reader :parent, :name
4
-
5
- def initialize( name, parent = nil )
4
+
5
+ def initialize(name, parent = nil)
6
6
  @parent = parent
7
7
  @name = name.to_sym
8
8
  end
9
-
9
+
10
10
  def path
11
- @parent ? ( @parent.path + '/' + name.to_s ) : name.to_s
11
+ @parent ? (@parent.path + '/' + name.to_s) : name.to_s
12
12
  end
13
-
13
+
14
14
  def root
15
15
  category = self
16
16
  while category.parent != nil
@@ -18,11 +18,10 @@ module NLP
18
18
  end
19
19
  category.name
20
20
  end
21
-
21
+
22
22
  def to_s
23
23
  "#{path.inspect}"
24
24
  end
25
-
26
-
25
+
27
26
  end
28
27
  end
data/lib/dictionary.rb CHANGED
@@ -5,12 +5,24 @@ require 'rid_category'
5
5
  require 'liwc_category'
6
6
 
7
7
  module NLP
8
+
8
9
  class Dictionary
9
- def initialize
10
- @tree = SearchTree.new
11
- @categories = {}
10
+ attr_accessor :tree
11
+
12
+
13
+ def initialize(category_file=:rid,restore = true)
14
+ state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
15
+ if restore and File.exist?(state_file)
16
+ d = Dictionary.restore(state_file)
17
+ @tree = d.tree
18
+ else
19
+ @tree = SearchTree.new
20
+ load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
21
+ store(state_file)
22
+ end
23
+
12
24
  end
13
-
25
+
14
26
  def store( state_file )
15
27
  File.open( File.expand_path( state_file ), "w" ) do |file|
16
28
  Marshal.dump( self, file )
@@ -24,50 +36,40 @@ module NLP
24
36
  end
25
37
  end
26
38
 
27
-
28
-
29
- def find( word )
39
+ def find(word)
30
40
  if @exception_pattern && @exception_pattern =~ word
31
41
  nil
32
42
  else
33
- @tree.find( word )
43
+ @tree.find(word)
34
44
  end
35
45
  end
36
-
37
46
 
38
- def load_categories( category_file,options )
47
+
48
+ def load_categories(category_file,type)
39
49
  category = nil
40
50
  primary = nil
41
51
  secondary = nil
42
52
  tertiary = nil
43
-
53
+
54
+ if type == :rid
55
+ cat_class = NLP.const_get("RIDCategory")
56
+ else
57
+ cat_class = NLP.const_get("LIWCCategory")
58
+ end
59
+
44
60
  File.open( category_file ) do |file|
45
61
  while line = file.gets
46
62
  line.chomp!
47
63
  begin
48
64
  lead, rest = line.scan( /(\t*)(.*)/ ).first
49
65
  if lead.size == 0
50
- if options[:rid]
51
- category = primary = RIDCategory.new( rest )
52
- else
53
- category = primary = LIWCCategory.new( rest )
54
- end
55
-
66
+ category = primary = cat_class.new(rest)
56
67
  secondary, tertiary = nil
57
68
  elsif lead.size == 1
58
- if options[:rid]
59
- category = secondary = RIDCategory.new( rest, primary )
60
- else
61
- category = secondary = LIWCCategory.new(rest,primary)
62
- end
69
+ category = secondary = cat_class.new(rest, primary )
63
70
  tertiary = nil
64
71
  elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
65
- if options[:rid]
66
-
67
- category = tertiary = RIDCategory.new( rest, secondary )
68
- else
69
- category = tertiary = LIWCCategory.new( rest, secondary )
70
- end
72
+ category = tertiary = cat_class.new( rest, secondary )
71
73
  else
72
74
  word = rest.downcase.gsub( /\s*\(1\)$/, '' )
73
75
  @tree.insert( word, category )
data/lib/emoticon.rb CHANGED
@@ -1,14 +1,14 @@
1
1
  require 'meaningable'
2
- module NLP
3
- class Emoticon < Token
4
- include Meaningable
5
-
6
- def initialize(tokens,tags)
7
- @orth = tokens.join("")
8
- @tags = 'emoticon'
9
- end
10
2
 
3
+ module NLP
4
+ class Emoticon < Token
5
+ include Meaningable
11
6
 
7
+ def initialize(tokens,tags)
8
+ @orth = tokens.join("")
9
+ @tags = 'emoticon'
12
10
  end
11
+
12
+ end
13
13
  end
14
14
 
data/lib/inflectable.rb CHANGED
@@ -1,61 +1,60 @@
1
1
  module Inflectable
2
-
3
- GRAM_CAT = {
4
- #rzeczownik
5
- :adj => 'przymiotnik',
6
- [:subst,:depr] => 'rzeczownik',
7
- :adv => 'przyslowek',
8
- :num => 'liczebnik',
9
- [:pron,:siebie] => 'zaimek',
10
- :prep => 'przyimek',
11
- #liczby
12
- :sg => 'liczba_pojedyncza',
13
- :pl => 'liczba_mnoga',
14
-
15
- #Przypadki
16
- :nom => 'mianownik',
17
- :gen => 'dopelniacz',
18
- :dat => 'celownik',
19
- :acc => 'biernik',
20
- :inst => 'narzednik',
21
- :loc => 'miejscownik',
22
- :voc => 'wolacz',
23
-
24
- #Rodzaje
25
- :m1 => 'meski_osobowy',
26
- :m2 => 'meski_zwierzecy',
27
- :m3 => 'meski_rzeczowy',
28
- :f => 'zenski',
29
- :n1 => 'nijaki_zbiorowy',
30
- :n2 => 'nijaki zwykly',
31
- :p1 => 'przymnogi_osobowy',
32
- :p2 => 'przymnogi_zwykly',
33
- :p3 => 'przymnogi_opisowy',
34
-
35
- #Osoby
36
- :pri => "pierwsza_osoba",
37
- :sec => "druga_osoba",
38
- :ter => "trzecia_osoba",
39
-
40
- #Stopień
41
- :pos => "stopien_rowny",
42
- :comp => "stopien_wyzszy",
43
- :sup => "stopien_najwyzszy"
44
- }
45
-
46
- GRAM_CAT.each do |key,value|
47
-
48
- define_method(value+"?"){
49
- inflection.split(":").any?{|e|
50
- if key.is_a? Array
51
- key.any?{|k| e.include? k.to_s}
52
- else
53
- e.include? key.to_s
54
- end
55
- }
56
- }
57
- end
58
-
59
-
60
-
2
+
3
+ GRAM_CAT = {
4
+ #rzeczownik
5
+ :adj => 'przymiotnik',
6
+ [:subst,:depr] => 'rzeczownik',
7
+ :adv => 'przyslowek',
8
+ :num => 'liczebnik',
9
+ [:pron,:siebie] => 'zaimek',
10
+ :prep => 'przyimek',
11
+ #liczby
12
+ :sg => 'liczba_pojedyncza',
13
+ :pl => 'liczba_mnoga',
14
+
15
+ #Przypadki
16
+ :nom => 'mianownik',
17
+ :gen => 'dopelniacz',
18
+ :dat => 'celownik',
19
+ :acc => 'biernik',
20
+ :inst => 'narzednik',
21
+ :loc => 'miejscownik',
22
+ :voc => 'wolacz',
23
+
24
+ #Rodzaje
25
+ :m1 => 'meski_osobowy',
26
+ :m2 => 'meski_zwierzecy',
27
+ :m3 => 'meski_rzeczowy',
28
+ :f => 'zenski',
29
+ :n1 => 'nijaki_zbiorowy',
30
+ :n2 => 'nijaki zwykly',
31
+ :p1 => 'przymnogi_osobowy',
32
+ :p2 => 'przymnogi_zwykly',
33
+ :p3 => 'przymnogi_opisowy',
34
+
35
+ #Osoby
36
+ :pri => "pierwsza_osoba",
37
+ :sec => "druga_osoba",
38
+ :ter => "trzecia_osoba",
39
+
40
+ #Stopień
41
+ :pos => "stopien_rowny",
42
+ :comp => "stopien_wyzszy",
43
+ :sup => "stopien_najwyzszy"
44
+ }
45
+
46
+ GRAM_CAT.each do |key,value|
47
+
48
+ define_method(value+"?"){
49
+ inflection.split(":").any?{|e|
50
+ if key.is_a? Array
51
+ key.any?{|k| e.include? k.to_s}
52
+ else
53
+ e.include? key.to_s
54
+ end
55
+ }
56
+ }
57
+ end
58
+
59
+
61
60
  end
data/lib/lemmatizer.rb CHANGED
@@ -3,106 +3,110 @@ require 'rexml/document'
3
3
  require 'morfeusz'
4
4
 
5
5
  module NLP
6
- class Lemmatizer
6
+ class Lemmatizer
7
+
7
8
  include REXML
8
-
9
- def self.lematize(text,method,input_type)
10
- if text.is_a? File
11
- str = text.read
12
- text.close
13
- elsif text.is_a? String
14
- str = text
15
- else
16
- raise ArgumentError, "Argument is not String or File"
17
- end
18
-
19
- if method === :takipi
20
- takipi_lematize(str,input_type)
21
- #Default lematization method is Morfeusz
22
- else
23
- morfeusz_lematize(str)
24
- end
25
9
 
10
+ def self.lemmatize(text,method=nil,input_type=nil)
11
+ if text.is_a? File
12
+ str = text.read
13
+ text.close
14
+ elsif text.is_a? String
15
+ str = text
16
+ else
17
+ raise ArgumentError, "Argument is not String or File"
18
+ end
19
+
20
+ if method === :takipi
21
+ takipi_lemmatize(str,input_type)
22
+
23
+ #Default lematization method is Morfeusz
24
+ else
25
+ morfeusz_lemmatize(str)
26
+ end
26
27
  end
27
28
 
28
29
 
29
30
 
30
- def self.takipi_lematize(text,method)
31
-
32
- if method === :local
31
+ def self.takipi_lemmatize(text,method)
33
32
 
34
- t1 = Thread.new do
35
- `takipi -i text.txt -o output.xml -it TXT`
36
- end
33
+ if method === :local
37
34
 
38
- t1.join
35
+ xml_file = TAKIPI_XML_FILE
39
36
 
40
- f = File.open("output.xml","r")
41
- doc = Document.new f
42
- elsif method === :remote
43
- xml = TakipiWebService.request(text)
44
- doc = Document.new xml
45
- else
46
- raise ArgumentError, 'Argument is not :local or :remote'
37
+ t1 = Thread.new do
38
+ `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
47
39
  end
48
40
 
49
- parse_lematized_xml(doc)
41
+ t1.join
42
+
43
+ f = File.open(xml_file,"r")
44
+ doc = Document.new f
45
+
46
+ elsif method === :remote
47
+ xml = TakipiWebService.request(text)
48
+ doc = Document.new xml
49
+ else
50
+ raise ArgumentError, 'Argument is not :local or :remote'
51
+ end
52
+
53
+ parse_lemmatized_xml(doc)
50
54
  end
51
55
 
52
56
 
53
- def self.morfeusz_lematize(text)
54
- temp_text = []
55
-
56
- #simple tagger
57
- #TODO lematizer should take block or object Tagger that defines
58
- #how split string
59
- text.split(/\.|!|\?/).each do |s|
60
- sentence = Sentence.new
61
- sentence << s.split(" ").collect{ |t|
62
- if word = Morfeusz::Lexeme.find(t)
63
- if word[0]
64
- Word.new(t,word[0].base_form,"")
65
- else
66
- Word.new(t,"","")
67
- end
68
- else
69
- Word.new(t,"","")
70
- end
71
- }
72
- temp_text.push sentence
73
- end
74
- temp_text
75
- end
76
-
77
-
78
- def self.parse_lematized_xml(doc)
79
-
80
- text = Text.new
81
-
82
- doc.elements.each("*/chunkList/chunk") do |chunk|
83
- sentence = Sentence.new
84
- tokens = []
85
-
86
- chunk.elements.each("tok") do |tok|
87
- word = tok.elements[1].text
88
- lemat, inflect = ""
89
-
90
- tok.elements.each("lex") do |lex|
91
- if lex.has_attributes?
92
- lemat = lex.elements[1].text
93
- inflect = lex.elements[2].text
94
- end
95
- end
96
-
97
- tokens << Word.new(word,lemat,inflect)
57
+ def self.morfeusz_lemmatize(text)
58
+ temp_text = Text.new
59
+
60
+ #simple tagger
61
+ #TODO lemmatizer should take TokenScanner object that defines
62
+ #how split string
63
+ text.split(/\.|!|\?/).each do |s|
64
+ sentence = Sentence.new
65
+ sentence << s.split(" ").collect{ |t|
66
+ if word = Morfeusz::Lexeme.find(t)
67
+ if word[0]
68
+ Word.new(t,word[0].base_form,"")
69
+ else
70
+ Word.new(t,"","")
98
71
  end
72
+ else
73
+ Word.new(t,"","")
74
+ end
75
+ }
76
+ temp_text << sentence
77
+ end
78
+ temp_text
79
+ end
80
+
99
81
 
100
- sentence << tokens
101
- text << sentence
82
+ def self.parse_lemmatized_xml(doc)
83
+
84
+ text = Text.new
85
+
86
+ doc.elements.each("*/chunkList/chunk") do |chunk|
87
+ sentence = Sentence.new
88
+ tokens = []
89
+
90
+ chunk.elements.each("tok") do |tok|
91
+ word = tok.elements[1].text
92
+ lemat, inflect = ""
93
+
94
+ tok.elements.each("lex") do |lex|
95
+ if lex.has_attributes?
96
+ lemat = lex.elements[1].text
97
+ inflect = lex.elements[2].text
98
+ end
99
+ end
100
+
101
+ tokens << Word.new(word,lemat,inflect)
102
102
  end
103
- text
103
+
104
+ sentence << tokens
105
+ text << sentence
106
+ end
107
+ text
104
108
  end
105
109
 
106
110
 
107
- end
111
+ end
108
112
  end