nlp 0.2.5 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
data/lib/analyzer.rb CHANGED
@@ -9,72 +9,40 @@ require 'sentence'
9
9
  require "token_scanner.rb"
10
10
  require "lemmatizer"
11
11
  require 'jcode'
12
+ require 'statistic'
12
13
  $KODE = "UTF8"
13
14
 
14
15
  module NLP
15
16
 
16
17
  class Analyzer
17
18
 
18
- CACHE_DIR = '~/'
19
-
20
- def initialize( category_file, restore = true )
21
- state_file = File.expand_path(Analyzer::CACHE_DIR)
22
- if restore
23
- @dictionary = Dictionary.restore(state_file)
24
- else
25
- @dictionary = Dictionary.new
26
- @dictionary.load_categories( category_file, :rid => true )
27
- @dictionary.store(state_file)
28
- end
29
-
19
+ def initialize(dict)
20
+ @dictionary = Dictionary.new(dict)
30
21
  end
31
-
32
-
33
- def analyze( scanner)
34
-
35
- results = {
36
- :word_count => 0,
37
- :word_total => 0,
38
- :scores => Hash.new { 0 },
39
- :words => []
40
- }
41
22
 
42
23
 
24
+ def analyze(scanner)
25
+
26
+ results = Statistic.new
43
27
 
44
- while token = scanner.current
28
+ while token = scanner.current
45
29
  word = token.lemat
46
30
 
47
- categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
31
+ categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
48
32
  unless categories.nil?
49
- categories.each do |category|
50
- puts "Znalazłem słowo #{word} : #{category}"
51
- results[:scores][category] = results[:scores][category] + 1
52
- results[:word_count] += 1
53
- results[:words].push word
54
- end
55
-
56
-
33
+ categories.each do |category|
34
+ puts "Znalazłem słowo #{word} : #{category}"
35
+ results.add(word,category)
36
+ end
57
37
  end
58
38
 
59
- results[:word_total] += 1
39
+ results.total_words += 1
60
40
  scanner.next(:word)
61
- end
62
-
63
- results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
64
- primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
65
- secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
66
- emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
67
-
41
+ end
68
42
 
69
- results[:classes] = {
70
- :primary => Float(primary_sum) / results[:word_count],
71
- :secondary => Float(secondary_sum) / results[:word_count],
72
- :emotions => Float(emotion_sum) / results[:word_count]
73
- }
74
-
75
43
  results
76
- end
77
44
 
45
+ end
78
46
  end
79
47
  end
80
48
 
data/lib/category.rb CHANGED
@@ -1,16 +1,16 @@
1
1
  module NLP
2
2
  class Category
3
3
  attr_reader :parent, :name
4
-
5
- def initialize( name, parent = nil )
4
+
5
+ def initialize(name, parent = nil)
6
6
  @parent = parent
7
7
  @name = name.to_sym
8
8
  end
9
-
9
+
10
10
  def path
11
- @parent ? ( @parent.path + '/' + name.to_s ) : name.to_s
11
+ @parent ? (@parent.path + '/' + name.to_s) : name.to_s
12
12
  end
13
-
13
+
14
14
  def root
15
15
  category = self
16
16
  while category.parent != nil
@@ -18,11 +18,10 @@ module NLP
18
18
  end
19
19
  category.name
20
20
  end
21
-
21
+
22
22
  def to_s
23
23
  "#{path.inspect}"
24
24
  end
25
-
26
-
25
+
27
26
  end
28
27
  end
data/lib/dictionary.rb CHANGED
@@ -5,12 +5,24 @@ require 'rid_category'
5
5
  require 'liwc_category'
6
6
 
7
7
  module NLP
8
+
8
9
  class Dictionary
9
- def initialize
10
- @tree = SearchTree.new
11
- @categories = {}
10
+ attr_accessor :tree
11
+
12
+
13
+ def initialize(category_file=:rid,restore = true)
14
+ state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
15
+ if restore and File.exist?(state_file)
16
+ d = Dictionary.restore(state_file)
17
+ @tree = d.tree
18
+ else
19
+ @tree = SearchTree.new
20
+ load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
21
+ store(state_file)
22
+ end
23
+
12
24
  end
13
-
25
+
14
26
  def store( state_file )
15
27
  File.open( File.expand_path( state_file ), "w" ) do |file|
16
28
  Marshal.dump( self, file )
@@ -24,50 +36,40 @@ module NLP
24
36
  end
25
37
  end
26
38
 
27
-
28
-
29
- def find( word )
39
+ def find(word)
30
40
  if @exception_pattern && @exception_pattern =~ word
31
41
  nil
32
42
  else
33
- @tree.find( word )
43
+ @tree.find(word)
34
44
  end
35
45
  end
36
-
37
46
 
38
- def load_categories( category_file,options )
47
+
48
+ def load_categories(category_file,type)
39
49
  category = nil
40
50
  primary = nil
41
51
  secondary = nil
42
52
  tertiary = nil
43
-
53
+
54
+ if type == :rid
55
+ cat_class = NLP.const_get("RIDCategory")
56
+ else
57
+ cat_class = NLP.const_get("LIWCCategory")
58
+ end
59
+
44
60
  File.open( category_file ) do |file|
45
61
  while line = file.gets
46
62
  line.chomp!
47
63
  begin
48
64
  lead, rest = line.scan( /(\t*)(.*)/ ).first
49
65
  if lead.size == 0
50
- if options[:rid]
51
- category = primary = RIDCategory.new( rest )
52
- else
53
- category = primary = LIWCCategory.new( rest )
54
- end
55
-
66
+ category = primary = cat_class.new(rest)
56
67
  secondary, tertiary = nil
57
68
  elsif lead.size == 1
58
- if options[:rid]
59
- category = secondary = RIDCategory.new( rest, primary )
60
- else
61
- category = secondary = LIWCCategory.new(rest,primary)
62
- end
69
+ category = secondary = cat_class.new(rest, primary )
63
70
  tertiary = nil
64
71
  elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
65
- if options[:rid]
66
-
67
- category = tertiary = RIDCategory.new( rest, secondary )
68
- else
69
- category = tertiary = LIWCCategory.new( rest, secondary )
70
- end
72
+ category = tertiary = cat_class.new( rest, secondary )
71
73
  else
72
74
  word = rest.downcase.gsub( /\s*\(1\)$/, '' )
73
75
  @tree.insert( word, category )
data/lib/emoticon.rb CHANGED
@@ -1,14 +1,14 @@
1
1
  require 'meaningable'
2
- module NLP
3
- class Emoticon < Token
4
- include Meaningable
5
-
6
- def initialize(tokens,tags)
7
- @orth = tokens.join("")
8
- @tags = 'emoticon'
9
- end
10
2
 
3
+ module NLP
4
+ class Emoticon < Token
5
+ include Meaningable
11
6
 
7
+ def initialize(tokens,tags)
8
+ @orth = tokens.join("")
9
+ @tags = 'emoticon'
12
10
  end
11
+
12
+ end
13
13
  end
14
14
 
data/lib/inflectable.rb CHANGED
@@ -1,61 +1,60 @@
1
1
  module Inflectable
2
-
3
- GRAM_CAT = {
4
- #rzeczownik
5
- :adj => 'przymiotnik',
6
- [:subst,:depr] => 'rzeczownik',
7
- :adv => 'przyslowek',
8
- :num => 'liczebnik',
9
- [:pron,:siebie] => 'zaimek',
10
- :prep => 'przyimek',
11
- #liczby
12
- :sg => 'liczba_pojedyncza',
13
- :pl => 'liczba_mnoga',
14
-
15
- #Przypadki
16
- :nom => 'mianownik',
17
- :gen => 'dopelniacz',
18
- :dat => 'celownik',
19
- :acc => 'biernik',
20
- :inst => 'narzednik',
21
- :loc => 'miejscownik',
22
- :voc => 'wolacz',
23
-
24
- #Rodzaje
25
- :m1 => 'meski_osobowy',
26
- :m2 => 'meski_zwierzecy',
27
- :m3 => 'meski_rzeczowy',
28
- :f => 'zenski',
29
- :n1 => 'nijaki_zbiorowy',
30
- :n2 => 'nijaki zwykly',
31
- :p1 => 'przymnogi_osobowy',
32
- :p2 => 'przymnogi_zwykly',
33
- :p3 => 'przymnogi_opisowy',
34
-
35
- #Osoby
36
- :pri => "pierwsza_osoba",
37
- :sec => "druga_osoba",
38
- :ter => "trzecia_osoba",
39
-
40
- #Stopień
41
- :pos => "stopien_rowny",
42
- :comp => "stopien_wyzszy",
43
- :sup => "stopien_najwyzszy"
44
- }
45
-
46
- GRAM_CAT.each do |key,value|
47
-
48
- define_method(value+"?"){
49
- inflection.split(":").any?{|e|
50
- if key.is_a? Array
51
- key.any?{|k| e.include? k.to_s}
52
- else
53
- e.include? key.to_s
54
- end
55
- }
56
- }
57
- end
58
-
59
-
60
-
2
+
3
+ GRAM_CAT = {
4
+ #rzeczownik
5
+ :adj => 'przymiotnik',
6
+ [:subst,:depr] => 'rzeczownik',
7
+ :adv => 'przyslowek',
8
+ :num => 'liczebnik',
9
+ [:pron,:siebie] => 'zaimek',
10
+ :prep => 'przyimek',
11
+ #liczby
12
+ :sg => 'liczba_pojedyncza',
13
+ :pl => 'liczba_mnoga',
14
+
15
+ #Przypadki
16
+ :nom => 'mianownik',
17
+ :gen => 'dopelniacz',
18
+ :dat => 'celownik',
19
+ :acc => 'biernik',
20
+ :inst => 'narzednik',
21
+ :loc => 'miejscownik',
22
+ :voc => 'wolacz',
23
+
24
+ #Rodzaje
25
+ :m1 => 'meski_osobowy',
26
+ :m2 => 'meski_zwierzecy',
27
+ :m3 => 'meski_rzeczowy',
28
+ :f => 'zenski',
29
+ :n1 => 'nijaki_zbiorowy',
30
+ :n2 => 'nijaki zwykly',
31
+ :p1 => 'przymnogi_osobowy',
32
+ :p2 => 'przymnogi_zwykly',
33
+ :p3 => 'przymnogi_opisowy',
34
+
35
+ #Osoby
36
+ :pri => "pierwsza_osoba",
37
+ :sec => "druga_osoba",
38
+ :ter => "trzecia_osoba",
39
+
40
+ #Stopień
41
+ :pos => "stopien_rowny",
42
+ :comp => "stopien_wyzszy",
43
+ :sup => "stopien_najwyzszy"
44
+ }
45
+
46
+ GRAM_CAT.each do |key,value|
47
+
48
+ define_method(value+"?"){
49
+ inflection.split(":").any?{|e|
50
+ if key.is_a? Array
51
+ key.any?{|k| e.include? k.to_s}
52
+ else
53
+ e.include? key.to_s
54
+ end
55
+ }
56
+ }
57
+ end
58
+
59
+
61
60
  end
data/lib/lemmatizer.rb CHANGED
@@ -3,106 +3,110 @@ require 'rexml/document'
3
3
  require 'morfeusz'
4
4
 
5
5
  module NLP
6
- class Lemmatizer
6
+ class Lemmatizer
7
+
7
8
  include REXML
8
-
9
- def self.lematize(text,method,input_type)
10
- if text.is_a? File
11
- str = text.read
12
- text.close
13
- elsif text.is_a? String
14
- str = text
15
- else
16
- raise ArgumentError, "Argument is not String or File"
17
- end
18
-
19
- if method === :takipi
20
- takipi_lematize(str,input_type)
21
- #Default lematization method is Morfeusz
22
- else
23
- morfeusz_lematize(str)
24
- end
25
9
 
10
+ def self.lemmatize(text,method=nil,input_type=nil)
11
+ if text.is_a? File
12
+ str = text.read
13
+ text.close
14
+ elsif text.is_a? String
15
+ str = text
16
+ else
17
+ raise ArgumentError, "Argument is not String or File"
18
+ end
19
+
20
+ if method === :takipi
21
+ takipi_lemmatize(str,input_type)
22
+
23
+ #Default lematization method is Morfeusz
24
+ else
25
+ morfeusz_lemmatize(str)
26
+ end
26
27
  end
27
28
 
28
29
 
29
30
 
30
- def self.takipi_lematize(text,method)
31
-
32
- if method === :local
31
+ def self.takipi_lemmatize(text,method)
33
32
 
34
- t1 = Thread.new do
35
- `takipi -i text.txt -o output.xml -it TXT`
36
- end
33
+ if method === :local
37
34
 
38
- t1.join
35
+ xml_file = TAKIPI_XML_FILE
39
36
 
40
- f = File.open("output.xml","r")
41
- doc = Document.new f
42
- elsif method === :remote
43
- xml = TakipiWebService.request(text)
44
- doc = Document.new xml
45
- else
46
- raise ArgumentError, 'Argument is not :local or :remote'
37
+ t1 = Thread.new do
38
+ `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
47
39
  end
48
40
 
49
- parse_lematized_xml(doc)
41
+ t1.join
42
+
43
+ f = File.open(xml_file,"r")
44
+ doc = Document.new f
45
+
46
+ elsif method === :remote
47
+ xml = TakipiWebService.request(text)
48
+ doc = Document.new xml
49
+ else
50
+ raise ArgumentError, 'Argument is not :local or :remote'
51
+ end
52
+
53
+ parse_lemmatized_xml(doc)
50
54
  end
51
55
 
52
56
 
53
- def self.morfeusz_lematize(text)
54
- temp_text = []
55
-
56
- #simple tagger
57
- #TODO lematizer should take block or object Tagger that defines
58
- #how split string
59
- text.split(/\.|!|\?/).each do |s|
60
- sentence = Sentence.new
61
- sentence << s.split(" ").collect{ |t|
62
- if word = Morfeusz::Lexeme.find(t)
63
- if word[0]
64
- Word.new(t,word[0].base_form,"")
65
- else
66
- Word.new(t,"","")
67
- end
68
- else
69
- Word.new(t,"","")
70
- end
71
- }
72
- temp_text.push sentence
73
- end
74
- temp_text
75
- end
76
-
77
-
78
- def self.parse_lematized_xml(doc)
79
-
80
- text = Text.new
81
-
82
- doc.elements.each("*/chunkList/chunk") do |chunk|
83
- sentence = Sentence.new
84
- tokens = []
85
-
86
- chunk.elements.each("tok") do |tok|
87
- word = tok.elements[1].text
88
- lemat, inflect = ""
89
-
90
- tok.elements.each("lex") do |lex|
91
- if lex.has_attributes?
92
- lemat = lex.elements[1].text
93
- inflect = lex.elements[2].text
94
- end
95
- end
96
-
97
- tokens << Word.new(word,lemat,inflect)
57
+ def self.morfeusz_lemmatize(text)
58
+ temp_text = Text.new
59
+
60
+ #simple tagger
61
+ #TODO lemmatizer should take TokenScanner object that defines
62
+ #how split string
63
+ text.split(/\.|!|\?/).each do |s|
64
+ sentence = Sentence.new
65
+ sentence << s.split(" ").collect{ |t|
66
+ if word = Morfeusz::Lexeme.find(t)
67
+ if word[0]
68
+ Word.new(t,word[0].base_form,"")
69
+ else
70
+ Word.new(t,"","")
98
71
  end
72
+ else
73
+ Word.new(t,"","")
74
+ end
75
+ }
76
+ temp_text << sentence
77
+ end
78
+ temp_text
79
+ end
80
+
99
81
 
100
- sentence << tokens
101
- text << sentence
82
+ def self.parse_lemmatized_xml(doc)
83
+
84
+ text = Text.new
85
+
86
+ doc.elements.each("*/chunkList/chunk") do |chunk|
87
+ sentence = Sentence.new
88
+ tokens = []
89
+
90
+ chunk.elements.each("tok") do |tok|
91
+ word = tok.elements[1].text
92
+ lemat, inflect = ""
93
+
94
+ tok.elements.each("lex") do |lex|
95
+ if lex.has_attributes?
96
+ lemat = lex.elements[1].text
97
+ inflect = lex.elements[2].text
98
+ end
99
+ end
100
+
101
+ tokens << Word.new(word,lemat,inflect)
102
102
  end
103
- text
103
+
104
+ sentence << tokens
105
+ text << sentence
106
+ end
107
+ text
104
108
  end
105
109
 
106
110
 
107
- end
111
+ end
108
112
  end