nlp 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/analyzer.rb ADDED
@@ -0,0 +1,72 @@
1
+ require 'dictionary'
2
+ require 'morfeusz'
3
+ require 'token'
4
+ require 'word'
5
+ require 'emoticon'
6
+ require 'sentence'
7
+ require "token_scanner.rb"
8
+ require "inflectable"
9
+ require "meaningable"
10
+
11
# Ruby 1.8 reads the global $KCODE to make strings and regexps multibyte-aware.
# The original assigned to the misspelled global $KODE, which nothing reads.
# Rubies that define Encoding (1.9+) no longer use $KCODE, so it is only set
# on older interpreters.
$KCODE = "UTF8" unless defined?(Encoding)
12
+
13
module NLP

  # Counts how the lemmas delivered by a token scanner fall into the
  # categories known to a Dictionary.
  class Analyzer

    include REXML
    #Lexeme = Apohllo::Morfeusz::Lexeme

    # category_file - path of the category definition file; it is only read
    #                 when +restore+ is false.
    # restore       - when true (the default) the dictionary is unmarshalled
    #                 from Dictionary::CACHE_DIR; otherwise it is rebuilt from
    #                 +category_file+ and written back to that cache path.
    def initialize( category_file, restore = true )
      state_file = File.expand_path(Dictionary::CACHE_DIR)
      if restore
        @dictionary = Dictionary.restore(state_file)
      else
        @dictionary = Dictionary.new
        @dictionary.load_categories( category_file )
        @dictionary.store(state_file)
      end
    end

    # Walks +scanner+ from its current token to the end, looking up every
    # token's lemma in the dictionary.
    #
    # scanner - object answering +current+ (token or nil) and +next(:word)+;
    #           tokens must answer +lemat+.
    #
    # Returns a Hash with the keys:
    #   :word_total    - number of tokens visited
    #   :word_count    - number of tokens that matched at least one category
    #   :words         - lemmas of the matched tokens, in order
    #   :scores        - category => hit count (missing keys read as 0)
    #   :sorted_scores - [category, count] pairs, highest count first
    #   :classes       - share of hits per category class, relative to
    #                    :word_count (0.0 for every class when nothing matched)
    def analyze( scanner)
      results = {
        :word_count => 0,
        :word_total => 0,
        :scores => Hash.new { 0 },
        :words => []
      }

      while token = scanner.current
        word = token.lemat
        categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )

        # The lookup answers nil for an unknown word, but it may also answer
        # an empty list (SearchTree#find does so for a word that is merely a
        # prefix of dictionary entries). Neither is a match.
        unless categories.nil? || categories.empty?
          categories.each do |category|
            # NOTE(review): every hit is printed to stdout; consider a logger.
            puts "#{word} : #{category.name}"
            results[:scores][category] += 1
          end

          results[:word_count] += 1
          results[:words].push word
        end

        results[:word_total] += 1
        scanner.next(:word)
      end

      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
      results[:classes] = {
        :primary   => class_share( results ) { |category| category.primary? },
        :secondary => class_share( results ) { |category| category.secondary? },
        :emotions  => class_share( results ) { |category| category.emotions? }
      }

      results
    end

    private

    # Sums the hit counts of the categories accepted by the block and divides
    # by the number of matched words. Returns 0.0 instead of NaN when no word
    # matched at all.
    def class_share( results )
      return 0.0 if results[:word_count].zero?

      hits = results[:sorted_scores].inject( 0 ) do |count, (category, score)|
        yield( category ) ? count + score : count
      end
      Float(hits) / results[:word_count]
    end

  end
end
data/lib/category.rb ADDED
@@ -0,0 +1,28 @@
1
module NLP
  # A named node in a category hierarchy; each node optionally points at
  # its parent category.
  class Category
    attr_reader :parent, :name

    # name   - anything answering +to_sym+; stored as a Symbol.
    # parent - the enclosing Category, or nil for a top-level category.
    def initialize( name, parent = nil )
      @name = name.to_sym
      @parent = parent
    end

    # Slash-separated names from the top-most ancestor down to this category.
    def path
      return name.to_s unless @parent

      [ @parent.path, name.to_s ].join( '/' )
    end

    # Name (Symbol) of the top-most ancestor; the category's own name when it
    # has no parent.
    def root
      node = self
      node = node.parent until node.parent.nil?
      node.name
    end

    # The path wrapped in quotes, as produced by String#inspect.
    def to_s
      path.inspect
    end
  end
end
data/lib/dictionary.rb ADDED
@@ -0,0 +1,70 @@
1
+
2
+ require 'stree'
3
+ require 'category'
4
+ require 'rid_category'
5
+
6
module NLP
  # Word => categories lookup backed by a SearchTree, with Marshal-based
  # persistence.
  class Dictionary
    # Default location of the marshalled dictionary. Despite the name it is
    # used as a file path (Analyzer passes it to store/restore).
    CACHE_DIR = '~/.rima'

    def initialize
      @tree = SearchTree.new
      @categories = {}
    end

    # Marshals the whole dictionary into +state_file+ (tilde is expanded).
    # The file is opened in binary mode because Marshal output is binary
    # data and must not go through newline translation.
    #
    # Returns self.
    def store( state_file )
      File.open( File.expand_path( state_file ), "wb" ) do |file|
        Marshal.dump( self, file )
      end
      self
    end

    # Loads a dictionary previously written by #store.
    #
    # SECURITY: Marshal can instantiate arbitrary objects; only restore
    # cache files this application wrote itself.
    #
    # Returns the unmarshalled object.
    def self.restore( state_file )
      File.open( File.expand_path( state_file ), "rb" ) do |file|
        Marshal.restore( file )
      end
    end

    # Looks +word+ up in the tree.
    #
    # Returns whatever the tree's #find answers, or nil when the word
    # matches @exception_pattern (which nothing in this class sets, so the
    # guard is currently inactive).
    def find( word )
      return nil if @exception_pattern && @exception_pattern =~ word

      @tree.find( word )
    end

    # Reads a tab-indented category file and fills the tree.
    #
    # Indentation decides what a line is:
    #   no tab                     - primary category
    #   one tab                    - secondary category (child of primary)
    #   two tabs, all upper case   - tertiary category (child of secondary)
    #   anything else              - a word belonging to the most recently
    #                                read category; it is downcased and a
    #                                trailing "(1)" marker is removed
    # Blank lines are skipped; previously they were taken for a primary
    # category with an empty name and silently reset the current category.
    #
    # Any error is reported with the offending line and re-raised.
    # Returns nil.
    def load_categories( category_file )
      category = primary = secondary = nil

      File.foreach( category_file ) do |raw|
        line = raw.chomp
        next if line.strip.empty?

        begin
          lead, rest = line.scan( /(\t*)(.*)/ ).first
          if lead.size == 0
            category = primary = RIDCategory.new( rest )
            secondary = nil
          elsif lead.size == 1
            category = secondary = RIDCategory.new( rest, primary )
          elsif lead.size == 2 && line.strip =~ /^[A-ZĄŚĘĆŃŹŻŁÓ]+$/
            category = RIDCategory.new( rest, secondary )
          else
            word = rest.downcase.gsub( /\s*\(1\)$/, '' )
            @tree.insert( word, category )
          end
        rescue
          puts "Error for line: #{line}"
          raise
        end
      end
    end
  end
end
70
+
data/lib/emoticon.rb ADDED
@@ -0,0 +1,13 @@
1
module NLP
  # Token whose surface form is glued together from several pieces and
  # whose tag is always 'emoticon'.
  class Emoticon < Token
    include Meaningable

    # tokens - pieces that are concatenated (no separator) into the
    #          surface form.
    # tags   - not used by this constructor; the tag is fixed.
    def initialize(tokens,tags)
      @tags = 'emoticon'
      @orth = tokens.join("")
    end
  end
end
13
+
@@ -0,0 +1,59 @@
1
# Adds one predicate per grammatical category to the including class.
# The includer must provide +inflection+, a colon-separated tag string
# (e.g. "subst:sg:nom:f").
module Inflectable

  # Tag (or list of alternative tags) => name of the predicate, without
  # the trailing "?".
  GRAM_CAT = {
    # parts of speech
    [:subst, :depr] => 'rzeczownik',
    :adj => 'przymiotnik',
    :adv => 'przyslowek',
    :num => 'liczebnik',
    [:pron,:siebie] => 'zaimek',
    :prep => 'przyimek',
    # numbers
    :sg => 'liczba_pojedyncza',
    :pl => 'liczba_mnoga',

    # cases
    :nom => 'mianownik',
    :gen => 'dopelniacz',
    :dat => 'celownik',
    :acc => 'biernik',
    :inst => 'narzednik',
    :loc => 'miejscownik',
    :voc => 'wolacz',

    # genders
    :m1 => 'męski_osobowy',
    :m2 => 'męski_zwierzęcy',
    :m3 => 'męski_rzeczowy',
    :f => 'żeński',
    :n1 => 'nijaki zbiorowy',
    :n2 => 'nijaki zwykły',
    :p1 => 'przymnogi osobowy',
    :p2 => 'przymnogi zwykły',
    :p3 => 'przymnogi opisowy',

    # persons
    :pri => "pierwsza_osoba",
    :sec => "druga_osoba",
    :ter => "trzecia_osoba",

    # degrees
    :pos => "stopien_rowny",
    :comp => "stopien_wyzszy",
    :sup => "stopien_najwyzszy"
  }

  # Defines "<name>?" for every entry. The predicate is true when any of
  # the entry's tags occurs as a whole segment of +inflection+.
  #
  # This replaces three defects of the earlier loop: array keys got no
  # predicate at all, the first letter of each tag was cut off before
  # matching (so :f became "" and matched everything), and matching was by
  # substring (so "f" was found inside "fin" or "imperf").
  #
  # Names that contain a space (e.g. "nijaki zbiorowy?") are kept as they
  # were; they can only be invoked through +send+.
  GRAM_CAT.each do |key, value|
    tags = Array(key).map { |tag| tag.to_s }

    define_method(value + "?") {
      segments = inflection.split(":")
      tags.any? { |tag| segments.include?(tag) }
    }
  end

end
@@ -0,0 +1,7 @@
1
module NLP
  # Category subclass that adds no behaviour of its own.
  class LIWCCategory < Category; end
end
@@ -0,0 +1,55 @@
1
# Placeholder interface for meaning-related queries. Every method is an
# empty stub and therefore returns nil.
module Meaningable
  # LIWC
  def positive_emotion?; end
  def negative_emotion?; end
  def emotion?; end
  def cognitive?; end
  def social?; end

  # experimental
  def bad_word?; end
  def emoticon?; end
  def filler?; end
  def nonfluent?; end

  # semantic
  def synonym?(other); end
  def synonyms; end
end
data/lib/morfeusz.rb ADDED
@@ -0,0 +1,69 @@
1
+ # Ruby bindings for Morfeusz v. 0.1
2
+ # Author: Aleksander Pohl
3
+ # apohllo@o2.pl
4
+
5
+ require 'rubygems'
6
+ require 'inline'
7
+ require 'singleton'
8
+ require 'iconv'
9
module NLP
  module Morfeusz
    MORFOPT_ENCODING = 1
    MORFEUSZ_UTF_8 = 8

    # Directory that contains morfeusz.h. The previous hard-coded location
    # stays the default; set MORFEUSZ_INCLUDE_DIR to build on other machines.
    INCLUDE_DIR = ENV.fetch('MORFEUSZ_INCLUDE_DIR', '/home/knife/morf/include/')

    # Singleton wrapper around the Morfeusz C library, compiled through
    # RubyInline.
    class Morfeusz
      include Singleton

      inline(:C) do |builder|
        builder.include '"morfeusz.h"'
        builder.add_compile_flags '-lmorfeusz', "-I#{INCLUDE_DIR}"

        # Switches the library to UTF-8 when the instance is created.
        builder.c <<-END
          void initialize(){
            morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
          }
        END

        builder.c <<-END
          char * about(){
            return morfeusz_about();
          }
        END

        # Collects the base form (haslo) of every interpretation returned by
        # morfeusz_analyse, up to the terminating entry with k == -1.
        # Declarations are kept at the top of the function, and rb_ary_push
        # is used instead of dispatching "push" through an int-typed ID.
        builder.c <<-END
          VALUE _base(VALUE str){
            char * p;
            int index = 0;
            InterpMorf * result;
            InterpMorf el;
            VALUE arr = rb_ary_new();
            p = StringValuePtr(str);
            result = morfeusz_analyse(p);
            while((el = result[index++]).k != -1){
              if(el.haslo != NULL){
                rb_ary_push(arr, rb_str_new2(el.haslo));
              }
            }
            return arr;
          }
        END
      end

      # word - String to analyse.
      #
      # Returns an Array of base-form Strings (a fresh array on every call).
      def base(word)
        _base(word)
      end
    end

    # A base form reported by Morfeusz.
    class Lexeme
      attr_reader :base_form

      def initialize(base_form)
        @base_form = base_form
      end

      # Returns an Array with one Lexeme per base form found for +word+
      # (empty when there is none).
      def self.find(word)
        Morfeusz.instance.base(word).map { |bf| Lexeme.new(bf) }
      end
    end
  end
end
data/lib/nlp.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'stdlib/ext/array'
2
+ require 'stdlib/ext/string.rb'
3
+ require 'analyzer'
4
+
@@ -0,0 +1,18 @@
1
module NLP
  # Category whose class is decided by the name of its top-most ancestor.
  class RIDCategory < Category
    # True when the top-most ancestor is :PIERWOTNE.
    def primary?
      rooted_in?( :PIERWOTNE )
    end

    # True when the top-most ancestor is :WTORNE.
    def secondary?
      rooted_in?( :WTORNE )
    end

    # True when the top-most ancestor is :EMOCJE.
    def emotions?
      rooted_in?( :EMOCJE )
    end

    private

    # Compares the root name with +root_name+.
    def rooted_in?( root_name )
      root == root_name
    end
  end
end
data/lib/sentence.rb ADDED
@@ -0,0 +1,14 @@
1
module NLP
  # Ordered collection of the tokens that make up one sentence.
  class Sentence
    attr_reader :tokens

    def initialize
      @tokens = []
    end

    # Appends every element of the Array +tokens+.
    #
    # Returns the sentence's internal token Array (not the Sentence).
    def <<(tokens)
      @tokens.concat(tokens)
    end
  end
end
@@ -0,0 +1,6 @@
1
class Array
  # Everything after the first element; [] for a one-element array and
  # nil for an empty one.
  def tail
    slice(1..-1)
  end
end
6
+
@@ -0,0 +1,19 @@
1
class String
  # Keeps the built-in element reference reachable under another name.
  alias_method :old_memeber, :[]

  # Whatever the built-in String#[] answers for +index+.
  # NOTE(review): this replaces any built-in String#ord - confirm that is
  # intended on the Ruby versions in use.
  def ord(index)
    old_memeber(index)
  end

  # The +index+-th match of /./ in the string (nil when out of range).
  def get(index)
    scan(/./).slice(index)
  end

  # Replaces the +index+-th match of /./ with +value+, modifying the
  # receiver in place.
  #
  # Returns +value+.
  def set(index,value)
    chars = scan(/./)
    chars[index] = value
    replace(chars.join)
    value
  end
end
19
+
data/lib/stree.rb ADDED
@@ -0,0 +1,85 @@
1
+
2
module NLP
  # Character trie that maps words to lists of values. A '*' stored in a
  # word acts as a wildcard: once a lookup reaches a node that has a '*'
  # child, the values stored under that '*' are returned immediately.
  class SearchTree
    ALPHABET = %w{* - a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u w y z ź ż}
    SYMBOLS = %w{* - : - / ) (}
    attr_accessor :value
    attr_accessor :subtrees

    # Slot layout of +subtrees+ follows ALPHABET:
    #  0 -> *
    #  1 -> -
    #  2 -> a
    # 33 -> ż
    def initialize
      @subtrees = Array.new( ALPHABET.size, nil )
      @value = []
    end

    # Stores +value+ under the word +s+.
    def insert( s, value )
      priv_insert( s.scan(/./), value )
    end

    # Returns the Array of values stored for +s+ (empty when +s+ is only a
    # prefix of stored words), the wildcard's values when a '*' is met on
    # the way, or nil when no stored word starts like +s+.
    def find( s )
      priv_find( s.scan(/./) )
    end

    # Yields the value list of every node, breadth-first, starting with
    # this node.
    def traverse
      queue = [ self ]
      until queue.empty?
        node = queue.shift
        yield node.value
        queue.concat node.children
      end
    end

    protected

    # Slot index of +chr+ in ALPHABET, or -1 for any other character.
    # Raises ArgumentError when +chr+ is nil.
    def key( chr )
      unless chr
        raise ArgumentError, "Argument chr is nil"
      end
      ALPHABET.index(chr) || -1
    end

    # All existing child nodes (slot children first, then overflow ones).
    def children
      nodes = ( @subtrees || [] ).compact
      nodes.concat( @overflow.values ) if @overflow
      nodes
    end

    # Child node for +chr+, or nil. Characters outside ALPHABET live in a
    # per-node Hash instead of slot -1, which used to alias them with the
    # last slot ('ż'). @overflow is created lazily, so trees restored from
    # an older Marshal dump (without it) keep working.
    def child( chr )
      index = key( chr )
      return @subtrees[index] if index >= 0

      @overflow && @overflow[chr]
    end

    # Like #child, but creates the node when it is missing.
    def child!( chr )
      index = key( chr )
      if index >= 0
        @subtrees[index] ||= SearchTree.new
      else
        @overflow ||= {}
        @overflow[chr] ||= SearchTree.new
      end
    end

    # s - Array of single-character Strings.
    def priv_insert( s, value )
      node = s.inject( self ) { |current, chr| current.child!( chr ) }
      node.value.push value
    end

    # search - Array of single-character Strings.
    def priv_find( search )
      node = self
      search.each do |chr|
        wildcard = node.subtrees[0]
        return wildcard.value if wildcard

        node = node.child( chr )
        return nil unless node
      end

      # The wildcard check also applies to the node the word ends on.
      wildcard = node.subtrees[0]
      wildcard ? wildcard.value : node.value
    end
  end
end
data/lib/token.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'inflectable'
2
module NLP
  # A piece of text (+orth+) together with its tag string (+tags+).
  class Token
    attr_reader :orth, :tags

    def initialize(orth,tags)
      @orth, @tags = orth, tags
    end

    # True when the tag string is exactly "interp".
    def interp?
      @tags.eql?("interp")
    end

    # True for anything that is neither "interp" nor a number.
    def word?
      !(interp? || number?)
    end

    # True when the tags contain "tnum".
    def number?
      @tags.include?("tnum")
    end

    # True when the tags contain "tnum:integer".
    def integer?
      @tags.include?("tnum:integer")
    end

    # True when the tags contain "tnum:frac".
    def float?
      @tags.include?("tnum:frac")
    end
  end
end
@@ -0,0 +1,137 @@
1
+
2
require 'rexml/document'
require 'shellwords'
require 'soap/rpc/driver'
4
module NLP
  # Cursor over the tokens of a text. The text is a list of sentences
  # (objects answering +tokens+); it is either given directly, produced by
  # the external +takipi+ tagger from a file, or produced from a raw String
  # with Morfeusz.
  class TokenScanner
    include REXML
    attr_reader :text, :tokens

    # text   - depends on +method+ (see below).
    # method - :file -> +text+ is the path of a plain-text file that is run
    #                   through takipi;
    #          :text -> +text+ is a raw String that is split and lemmatised
    #                   here;
    #          anything else -> +text+ already is a list of sentences.
    def initialize(text, method)
      @pos = 0

      @text = case method
              when :file
                puts "laduje tekst"
                load_lemated_text(text)
              when :text
                lematize_text(text)
              else
                text
              end

      @tokens = flatten_text(@text)
    end

    # Moves at least one token forward and then on to the nearest token
    # of the requested kind (:word, :interp or :number). Any other +type+
    # just advances by one. Returns nil.
    def next(type)
      @pos += 1
      predicate = case type
                  when :word   then :word?
                  when :interp then :interp?
                  when :number then :number?
                  end
      return nil unless predicate

      @pos += 1 until end? || @tokens[@pos].send(predicate)
      nil
    end

    # Token under the cursor, or nil once the cursor is at or past the end.
    def current
      end? ? nil : @tokens[@pos]
    end

    def index
      @pos
    end

    # True once the cursor is at or past the end. #next always advances by
    # one first, so the cursor can overshoot the end; comparing with ==
    # would then report false again.
    def end?
      @pos >= tokens.size
    end


    private

    # Concatenates the tokens of all sentences into one Array.
    def flatten_text(text)
      flattened = []
      text.each do |sentence|
        sentence.tokens.each { |token| flattened << token }
      end
      flattened
    end

    # Runs takipi on +text_file+ and turns its XML output into sentences.
    #
    # The file name is shell-escaped before being placed on the command
    # line, so names with spaces or shell metacharacters are passed through
    # literally instead of being interpreted by the shell.
    # NOTE(review): the result is always written to "output.xml" in the
    # current directory, and the exit status of takipi is not checked.
    def load_lemated_text(text_file)
      `takipi -i #{Shellwords.escape(text_file)} -o output.xml -it TXT`

      text = []
      File.open("output.xml") do |f|
        doc = Document.new(f)

        doc.elements.each("*/chunkList/chunk") do |chunk|
          sentence = Sentence.new
          tokens = []

          chunk.elements.each("tok") do |tok|
            word = tok.elements[1].text
            # Both default to "" (previously inflect was left nil when no
            # <lex> element carried attributes).
            lemat = ""
            inflect = ""

            # Only <lex> elements that carry attributes are used; when
            # several do, the last one wins.
            tok.elements.each("lex") do |lex|
              if lex.has_attributes?
                lemat = lex.elements[1].text
                inflect = lex.elements[2].text
              end
            end

            tokens << Word.new(word,lemat,inflect)
          end

          sentence << tokens
          text << sentence
        end
      end
      text
    end

    # Splits +text+ into sentences on '.', '!' and '?', and each sentence
    # into whitespace-separated words. Every word gets the first base form
    # Morfeusz reports for it, or "" when there is none. The inflection is
    # always "".
    def lematize_text(text)
      text.split(/\.|!|\?/).map do |raw_sentence|
        sentence = Sentence.new
        sentence << raw_sentence.split(" ").map { |t|
          lexemes = Morfeusz::Lexeme.find(t)
          first = lexemes && lexemes[0]
          Word.new(t, first ? first.base_form : "", "")
        }
        sentence
      end
    end

  end

end