nlp 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/analyzer.rb ADDED
@@ -0,0 +1,72 @@
1
+ require 'dictionary'
2
+ require 'morfeusz'
3
+ require 'token'
4
+ require 'word'
5
+ require 'emoticon'
6
+ require 'sentence'
7
+ require "token_scanner.rb"
8
+ require "inflectable"
9
+ require "meaningable"
10
+
11
+ $KODE = "UTF8"
12
+
13
module NLP

  # Scores the tokens delivered by a scanner against a category dictionary.
  #
  # The dictionary is either restored from the cache file named by
  # Dictionary::CACHE_DIR or built from a category file and written to that
  # cache.
  class Analyzer

    include REXML

    # category_file - path passed to Dictionary#load_categories; only read
    #                 when +restore+ is false.
    # restore       - when true (the default) the dictionary is loaded from
    #                 the cache file instead of being rebuilt.
    def initialize( category_file, restore = true )
      state_file = File.expand_path(Dictionary::CACHE_DIR)
      if restore
        @dictionary = Dictionary.restore(state_file)
      else
        @dictionary = Dictionary.new
        @dictionary.load_categories( category_file )
        @dictionary.store(state_file)
      end
    end

    # Walks +scanner+ (anything answering #current and #next(:word)) and
    # looks up the lemma of every token in the dictionary.
    #
    # Returns a Hash with:
    #   :word_count    - number of tokens for which the lookup returned non-nil
    #   :word_total    - number of tokens visited
    #   :scores        - category => number of hits
    #   :words         - lemmas of the counted tokens
    #   :sorted_scores - [category, hits] pairs, highest count first
    #   :classes       - hits of primary / secondary / emotion categories
    #                    divided by :word_count (0.0 when nothing matched)
    def analyze( scanner )
      results = {
        :word_count => 0,
        :word_total => 0,
        :scores => Hash.new { 0 },
        :words => []
      }

      while token = scanner.current
        word = token.lemat

        # Everything except word characters and hyphens is stripped before
        # the lookup.
        categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
        unless categories.nil?
          categories.each do |category|
            puts "#{word} : #{category.name}"
            results[:scores][category] += 1
          end

          results[:word_count] += 1
          results[:words].push word
        end

        results[:word_total] += 1
        scanner.next(:word)
      end

      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
      results[:classes] = {
        :primary   => class_share( results ) { |category| category.primary? },
        :secondary => class_share( results ) { |category| category.secondary? },
        :emotions  => class_share( results ) { |category| category.emotions? }
      }

      results
    end

    private

    # Sums the hits of every category accepted by the block and divides the
    # sum by the number of counted words. Returns 0.0 instead of NaN when no
    # word was counted.
    def class_share( results )
      matched = results[:sorted_scores].inject( 0 ) do |count, (category, score)|
        yield( category ) ? count + score : count
      end
      return 0.0 if results[:word_count].zero?

      Float( matched ) / results[:word_count]
    end

  end
end
data/lib/category.rb ADDED
@@ -0,0 +1,28 @@
1
module NLP
  # A named node of a category hierarchy. A node only knows its parent; the
  # name is stored as a Symbol.
  class Category
    attr_reader :parent, :name

    # name   - anything answering #to_sym
    # parent - the enclosing Category, or nil for a top-level category
    def initialize( name, parent = nil )
      @parent = parent
      @name = name.to_sym
    end

    # Slash-separated names from the top-level ancestor down to this node.
    def path
      return name.to_s unless @parent

      @parent.path + '/' + name.to_s
    end

    # Name (a Symbol) of the top-most ancestor; the node's own name when it
    # has no parent.
    def root
      node = self
      node = node.parent until node.parent.nil?
      node.name
    end

    # The path rendered with String#inspect, i.e. wrapped in quotes.
    def to_s
      path.inspect
    end
  end
end
data/lib/dictionary.rb ADDED
@@ -0,0 +1,70 @@
1
+
2
+ require 'stree'
3
+ require 'category'
4
+ require 'rid_category'
5
+
6
module NLP
  # Word -> category lookup backed by a SearchTree. The dictionary is built
  # from a tab-indented category file and can be cached with Marshal.
  class Dictionary
    # Despite the name, callers in this package use this path as the cache
    # *file* handed to #store / .restore.
    CACHE_DIR = '~/.rima'

    def initialize
      @tree = SearchTree.new
      @categories = {}
    end

    # Serialises the whole dictionary to +state_file+ and returns self.
    # The file is opened in binary mode because Marshal output is binary.
    def store( state_file )
      File.open( File.expand_path( state_file ), "wb" ) do |file|
        Marshal.dump( self, file )
      end
      self
    end

    # Loads a dictionary previously written by #store.
    #
    # NOTE(review): Marshal can instantiate arbitrary objects; only restore
    # cache files this program wrote itself.
    def self.restore( state_file )
      File.open( File.expand_path( state_file ), "rb" ) do |file|
        Marshal.restore( file )
      end
    end

    # Returns whatever the tree holds for +word+, or nil when the word matches
    # @exception_pattern. Nothing in this class assigns that pattern, so it
    # only takes effect if it is set from elsewhere.
    def find( word )
      if @exception_pattern && @exception_pattern =~ word
        nil
      else
        @tree.find( word )
      end
    end

    # Reads a category file whose hierarchy is expressed with leading tabs:
    #
    #   no tab                - primary category
    #   one tab               - secondary category (child of the last primary)
    #   two tabs, upper case  - tertiary category (child of the last secondary)
    #   anything else         - a word belonging to the most recent category
    #
    # A trailing "(1)" marker is removed from words. Blank and
    # whitespace-only lines are skipped, so they can neither create an
    # empty-named category nor insert an empty word.
    # Errors are reported with the offending line and re-raised.
    def load_categories( category_file )
      category = nil
      primary = nil
      secondary = nil

      File.open( category_file ) do |file|
        while line = file.gets
          line.chomp!
          next if line.strip.empty?

          begin
            lead, rest = line.scan( /(\t*)(.*)/ ).first
            if lead.size == 0
              category = primary = RIDCategory.new( rest )
              secondary = nil
            elsif lead.size == 1
              category = secondary = RIDCategory.new( rest, primary )
            elsif lead.size == 2 && line.strip =~ /^[A-ZĄŚĘĆŃŹŻŁÓ]+$/
              category = RIDCategory.new( rest, secondary )
            else
              word = rest.downcase.gsub( /\s*\(1\)$/, '' )
              @tree.insert( word, category )
            end
          rescue
            puts "Error for line: #{line}"
            raise
          end
        end
      end
    end
  end
end
70
+
data/lib/emoticon.rb ADDED
@@ -0,0 +1,13 @@
1
module NLP
  # A token assembled from several pieces whose tag is always 'emoticon'.
  class Emoticon < Token
    include Meaningable

    # tokens - pieces that are concatenated to form the surface text
    # tags   - accepted for signature compatibility; the value is not used
    def initialize(tokens,tags)
      @orth, @tags = tokens.join(""), 'emoticon'
    end
  end
end
13
+
@@ -0,0 +1,59 @@
1
# Adds one predicate per grammatical category to any class that provides an
# #inflection method returning a colon-separated tag string.
module Inflectable

  # Tag (or list of alternative tags) => base name of the generated predicate.
  GRAM_CAT = {
    # parts of speech
    [:subst, :depr] => 'rzeczownik',
    :adj => 'przymiotnik',
    :adv => 'przyslowek',
    :num => 'liczebnik',
    [:pron,:siebie] => 'zaimek',
    :prep => 'przyimek',
    # grammatical number
    :sg => 'liczba_pojedyncza',
    :pl => 'liczba_mnoga',

    # cases
    :nom => 'mianownik',
    :gen => 'dopelniacz',
    :dat => 'celownik',
    :acc => 'biernik',
    :inst => 'narzednik',
    :loc => 'miejscownik',
    :voc => 'wolacz',

    # genders
    :m1 => 'męski_osobowy',
    :m2 => 'męski_zwierzęcy',
    :m3 => 'męski_rzeczowy',
    :f => 'żeński',
    :n1 => 'nijaki zbiorowy',
    :n2 => 'nijaki zwykły',
    :p1 => 'przymnogi osobowy',
    :p2 => 'przymnogi zwykły',
    :p3 => 'przymnogi opisowy',

    # persons
    :pri => "pierwsza_osoba",
    :sec => "druga_osoba",
    :ter => "trzecia_osoba",

    # degrees of comparison
    :pos => "stopien_rowny",
    :comp => "stopien_wyzszy",
    :sup => "stopien_najwyzszy"
  }

  # Define "<name>?" for every entry. The predicate is true when any of the
  # entry's tags equals one segment of the inflection string.
  #
  # Segments are split on ':' and also on '.'.
  # NOTE(review): the '.' split assumes ambiguous values are written like
  # "m1.m2.m3" -- confirm against the tagger output actually in use.
  #
  # Names containing a space (e.g. "nijaki zbiorowy?") keep their original
  # spelling and are therefore only reachable through #send.
  GRAM_CAT.each do |key, value|
    tags = Array(key).map { |tag| tag.to_s }

    define_method(value + "?") do
      segments = inflection.to_s.split(/[:.]/)
      tags.any? { |tag| segments.include?(tag) }
    end
  end

end
@@ -0,0 +1,7 @@
1
module NLP
  # Category subclass that adds no behaviour of its own.
  class LIWCCategory < Category; end
end
@@ -0,0 +1,55 @@
1
# Placeholder interface for meaning-related queries on tokens.
# None of the methods is implemented: each has an empty body and returns nil.
module Meaningable
  # --- LIWC ---
  def positive_emotion?; end

  def negative_emotion?; end

  def emotion?; end

  def cognitive?; end

  def social?; end

  # --- experimental ---
  def bad_word?; end

  def emoticon?; end

  def filler?; end

  def nonfluent?; end

  # --- semantic ---
  # other - the object to compare with (currently ignored)
  def synonym?(other); end

  def synonyms; end
end
data/lib/morfeusz.rb ADDED
@@ -0,0 +1,69 @@
1
+ # Ruby bindings for Morfeusz v. 0.1
2
+ # Author: Aleksander Pohl
3
+ # apohllo@o2.pl
4
+
5
+ require 'rubygems'
6
+ require 'inline'
7
+ require 'singleton'
8
+ require 'iconv'
9
module NLP
  # Bindings for the Morfeusz morphological analyser, compiled on the fly
  # with RubyInline.
  module Morfeusz
    MORFOPT_ENCODING = 1
    MORFEUSZ_UTF_8 = 8

    # Singleton wrapper around the C library.
    class Morfeusz
      include Singleton

      inline(:C) do |builder|
        builder.include '"morfeusz.h"'
        # The header location used to be hard-coded; it can now be overridden
        # with MORFEUSZ_INCLUDE_DIR while keeping the old default.
        builder.add_compile_flags "-I#{ENV['MORFEUSZ_INCLUDE_DIR'] || '/home/knife/morf/include/'}"
        # The library is a linker input, so it is passed as a link flag
        # rather than a compile flag.
        builder.add_link_flags '-lmorfeusz'

        # Runs when the singleton is created: selects UTF-8 encoding.
        builder.c <<-END
          void initialize(){
            morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
          }
        END

        # Returns the library's about string.
        builder.c <<-END
          char * about(){
            return morfeusz_about();
          }
        END

        # Collects the base form (haslo) of every interpretation; the result
        # list ends at the entry whose k field is -1.
        builder.c <<-END
          VALUE _base(VALUE str){
            char * p;
            int index = 0;
            InterpMorf * result;
            VALUE arr = rb_ary_new();
            p = StringValuePtr(str);
            result = morfeusz_analyse(p);
            if(result == NULL){
              return arr;
            }
            while(result[index].k != -1){
              if(result[index].haslo != NULL){
                rb_ary_push(arr, rb_str_new2(result[index].haslo));
              }
              index++;
            }
            return arr;
          }
        END
      end

      # Returns an Array with the base forms found for +word+ (possibly
      # empty).
      def base(word)
        _base(word)
      end
    end

    # Value object holding one base form.
    class Lexeme
      attr_reader :base_form

      def initialize(base_form)
        @base_form = base_form
      end

      # Returns one Lexeme per base form reported for +word+.
      def self.find(word)
        Morfeusz.instance.base(word).collect { |bf| Lexeme.new(bf) }
      end
    end
  end
end
data/lib/nlp.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'stdlib/ext/array'
2
+ require 'stdlib/ext/string.rb'
3
+ require 'analyzer'
4
+
@@ -0,0 +1,18 @@
1
module NLP
  # Category whose class is decided by the name of its top-level ancestor.
  class RIDCategory < Category
    # True when the top-level ancestor is :PIERWOTNE.
    def primary?
      rooted_at?( :PIERWOTNE )
    end

    # True when the top-level ancestor is :WTORNE.
    def secondary?
      rooted_at?( :WTORNE )
    end

    # True when the top-level ancestor is :EMOCJE.
    def emotions?
      rooted_at?( :EMOCJE )
    end

    private

    # Compares the root name with +expected+.
    def rooted_at?( expected )
      root == expected
    end
  end
end
data/lib/sentence.rb ADDED
@@ -0,0 +1,14 @@
1
module NLP
  # An ordered list of tokens.
  class Sentence
    attr_reader :tokens

    def initialize
      @tokens = Array.new
    end

    # Appends every element of +tokens+ (an Array) and returns the sentence's
    # token list.
    def <<(tokens)
      @tokens.concat(tokens)
    end
  end
end
@@ -0,0 +1,6 @@
1
class Array
  # Everything after the first element. Always returns an Array: an empty
  # receiver gives [] (the slice-based version returned nil in that case).
  def tail
    drop(1)
  end
end
6
+
@@ -0,0 +1,19 @@
1
class String
  alias old_memeber []

  # Keep the interpreter's own zero-argument String#ord reachable before it
  # is overridden below. The guard stops a second load of this file from
  # aliasing the override onto itself.
  if method_defined?(:ord) && !method_defined?(:nlp_builtin_ord)
    alias nlp_builtin_ord ord
  end

  # With an index: whatever String#[] returns for that index.
  # Without an index: the built-in String#ord, so code that calls "x".ord
  # keeps working after this file is loaded.
  def ord(index = nil)
    if index.nil?
      return nlp_builtin_ord if respond_to?(:nlp_builtin_ord)
      raise ArgumentError, "wrong number of arguments (0 for 1)"
    end
    old_memeber(index)
  end

  # Character at +index+, or nil when out of range. Newlines count as
  # characters.
  def get(index)
    scan(/./m)[index]
  end

  # Replaces the character at +index+ with +value+ in place and returns
  # +value+. The /m flag lets the pattern match newlines, so they survive the
  # rewrite.
  def set(index,value)
    chars = scan(/./m)
    chars[index] = value
    replace(chars.join)
    value
  end
end
19
+
data/lib/stree.rb ADDED
@@ -0,0 +1,85 @@
1
+
2
module NLP
  # Character trie. Each node has one child slot per ALPHABET entry; the
  # values inserted for a string are stored on the node reached by its last
  # character.
  #
  # A '*' child acts as a wildcard: as soon as a lookup reaches a node that
  # has one, the wildcard's values are returned, whatever remains of the
  # search string.
  class SearchTree
    ALPHABET = %w{* - a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u w y z ź ż}
    SYMBOLS = %w{* - : - / ) (}
    attr_accessor :value
    attr_accessor :subtrees

    # Slot layout of @subtrees:
    #   0  -> *
    #   1  -> -
    #   2  -> a
    #   33 -> ż
    def initialize
      @subtrees = Array.new( ALPHABET.size, nil )
      # Children for characters outside ALPHABET, keyed by character and
      # created on demand. They used to share slot -1 (i.e. "ż").
      @extra_subtrees = nil
      @value = []
    end

    # Stores +value+ under the string +s+.
    def insert( s, value )
      priv_insert( s.scan(/./), value )
    end

    # Returns the values stored for +s+ (possibly an empty Array when only a
    # longer entry passes through that node) or nil when no path exists.
    def find( s )
      priv_find( s.scan(/./) )
    end

    # Yields the value list of every node, breadth first, starting with the
    # receiver.
    def traverse
      queue = [self]
      until queue.empty?
        node = queue.shift
        yield node.value
        queue.concat( node.child_nodes )
      end
    end

    protected

    # Index of +chr+ in ALPHABET, or -1 for any other character.
    def key( chr )
      raise ArgumentError, "Argument chr is nil" unless chr

      ALPHABET.index(chr) || -1
    end

    # Child reached through +chr+; with +create+ the child is added when
    # missing. Returns nil when it does not exist and +create+ is false.
    def child_for( chr, create = false )
      index = key( chr )
      if index >= 0
        @subtrees[index] ||= SearchTree.new if create
        @subtrees[index]
      else
        @extra_subtrees ||= {} if create
        return nil unless @extra_subtrees

        @extra_subtrees[chr] ||= SearchTree.new if create
        @extra_subtrees[chr]
      end
    end

    # All existing children of this node.
    def child_nodes
      regular = ( @subtrees || [] ).compact
      @extra_subtrees ? regular + @extra_subtrees.values : regular
    end

    # +s+ is an Array of characters.
    def priv_insert( s, value )
      if s.empty?
        @value.push value
      else
        child_for( s.first, true ).priv_insert( s[1..-1], value )
      end
    end

    # +search+ is an Array of characters.
    def priv_find( search )
      wildcard = @subtrees[0]
      return wildcard.value if wildcard
      return value if search.empty?

      child = child_for( search.first )
      child ? child.priv_find( search[1..-1] ) : nil
    end
  end
end
data/lib/token.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'inflectable'
2
module NLP
  # A piece of text (+orth+) together with its tag string (+tags+).
  class Token
    attr_reader :orth, :tags

    def initialize(orth,tags)
      @orth, @tags = orth, tags
    end

    # True when the tag string is exactly "interp".
    def interp?
      @tags.eql?("interp")
    end

    # True for everything that is neither "interp" nor a number.
    def word?
      !(interp? || number?)
    end

    # True when the tags contain "tnum".
    def number?
      tagged_with?("tnum")
    end

    # True when the tags contain "tnum:integer".
    def integer?
      tagged_with?("tnum:integer")
    end

    # True when the tags contain "tnum:frac".
    def float?
      tagged_with?("tnum:frac")
    end

    private

    # Delegates to #include? of the tags object.
    def tagged_with?(marker)
      @tags.include?(marker)
    end
  end
end
@@ -0,0 +1,137 @@
1
+
2
+ require 'rexml/document'
3
+ require 'soap/rpc/driver'
4
module NLP
  # Cursor over the tokens of a text.
  #
  # The text is an Array of sentence-like objects (each answering #tokens).
  # It is produced by running the external +takipi+ tagger on a file
  # (:file), by lemmatising a String with Morfeusz (:text), or taken as
  # given (any other +method+).
  class TokenScanner
    include REXML
    attr_reader :text, :tokens

    # text   - a file name (:file), a String (:text) or prepared sentences
    # method - :file, :text, or anything else for prepared sentences
    def initialize(text, method)
      @pos = 0

      if method === :file
        puts "laduje tekst"
        @text = load_lemated_text(text)
      elsif method === :text
        @text = lematize_text(text)
      else
        @text = text
      end

      @tokens = flatten_text(@text)
    end

    # Advances by one token and then on to the next token of the requested
    # +type+ (:word, :interp or :number); any other type just advances by
    # one. The cursor never moves past the end, so #end? stays true once it
    # has been reached.
    def next(type)
      @pos += 1 unless end?

      wanted = { :word => :word?, :interp => :interp?, :number => :number? }[type]
      if wanted
        @pos += 1 until end? || @tokens[@pos].send(wanted)
      end
      nil
    end

    # The token under the cursor, or nil at the end.
    def current
      end? ? nil : @tokens[@pos]
    end

    def index
      @pos
    end

    def end?
      @pos >= @tokens.size
    end

    private

    # Concatenates the tokens of every sentence into one Array.
    def flatten_text(text)
      flattened = []
      text.each do |sentence|
        sentence.tokens.each { |token| flattened << token }
      end
      flattened
    end

    # Runs takipi on +text_file+ and converts the "output.xml" it writes to
    # the current directory into sentences of Word objects.
    def load_lemated_text(text_file)
      # Arguments are passed as a list so the file name never goes through a
      # shell.
      started = system('takipi', '-i', text_file.to_s, '-o', 'output.xml', '-it', 'TXT')
      raise Errno::ENOENT, 'takipi' if started.nil?

      text = []
      File.open("output.xml") do |f|
        doc = Document.new(f)

        doc.elements.each("*/chunkList/chunk") do |chunk|
          sentence = Sentence.new
          tokens = []

          chunk.elements.each("tok") do |tok|
            word = tok.elements[1].text
            lemat = ""
            inflect = ""

            # The last <lex> element carrying attributes wins.
            tok.elements.each("lex") do |lex|
              if lex.has_attributes?
                lemat = lex.elements[1].text
                inflect = lex.elements[2].text
              end
            end

            tokens << Word.new(word,lemat,inflect)
          end

          sentence << tokens
          text << sentence
        end
      end
      text
    end

    # Splits +text+ into sentences on '.', '!' and '?', and into tokens on
    # whitespace. Each token becomes a Word whose lemma is the first base
    # form Morfeusz reports, or "" when there is none.
    def lematize_text(text)
      text.split(/\.|!|\?/).collect do |chunk|
        sentence = Sentence.new
        sentence << chunk.split(" ").collect { |t|
          lexemes = Morfeusz::Lexeme.find(t)
          lexeme = lexemes && lexemes[0]
          Word.new(t, lexeme ? lexeme.base_form : "", "")
        }
        sentence
      end
    end
  end
end