nlp 0.2.5 → 0.2.6
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/analyzer.rb
CHANGED
@@ -9,72 +9,40 @@ require 'sentence'
 require "token_scanner.rb"
 require "lemmatizer"
 require 'jcode'
+require 'statistic'
 $KODE = "UTF8"

 module NLP

   class Analyzer

-    def initialize( category_file, restore = true )
-      state_file = File.expand_path(Analyzer::CACHE_DIR)
-      if restore
-        @dictionary = Dictionary.restore(state_file)
-      else
-        @dictionary = Dictionary.new
-        @dictionary.load_categories( category_file, :rid => true )
-        @dictionary.store(state_file)
-      end
+    def initialize(dict)
+      @dictionary = Dictionary.new(dict)
     end

-    def analyze( scanner)
-
-      results = {
-        :word_count => 0,
-        :word_total => 0,
-        :scores => Hash.new { 0 },
-        :words => []
-      }
+    def analyze(scanner)
+
+      results = Statistic.new

+      while token = scanner.current
        word = token.lemat

-        categories = @dictionary.find(
+        categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
        unless categories.nil?
-          results[:words].push word
-        end
+          categories.each do |category|
+            puts "Znalazłem słowo #{word} : #{category}"
+            results.add(word,category)
+          end
        end

-        results
+        results.total_words += 1
        scanner.next(:word)

-      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
-      primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
-      secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
-      emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
+      end

-      results[:classes] = {
-        :primary => Float(primary_sum) / results[:word_count],
-        :secondary => Float(secondary_sum) / results[:word_count],
-        :emotions => Float(emotion_sum) / results[:word_count]
-      }
-
       results
-    end

+    end
   end
 end
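Note: Analyzer no longer builds and caches its own Dictionary; it simply wraps Dictionary.new, and analyze now returns a Statistic instead of a raw hash. A minimal usage sketch inferred from this diff; the TokenScanner constructor shown here is an assumption, not part of this file:

    require 'nlp'

    analyzer = NLP::Analyzer.new(:rid)             # builds Dictionary.new(:rid) internally
    scanner  = NLP::TokenScanner.new("Ala ma kota.") # hypothetical constructor, not shown in this diff
    stats    = analyzer.analyze(scanner)           # returns an NLP::Statistic
    puts stats.total_words                         # incremented once per scanned word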
data/lib/category.rb
CHANGED
@@ -1,16 +1,16 @@
 module NLP
   class Category
     attr_reader :parent, :name
-
-    def initialize(
+
+    def initialize(name, parent = nil)
       @parent = parent
       @name = name.to_sym
     end
-
+
     def path
-      @parent ? (
+      @parent ? (@parent.path + '/' + name.to_s) : name.to_s
     end
-
+
     def root
       category = self
       while category.parent != nil
@@ -18,11 +18,10 @@ module NLP
       end
       category.name
     end
-
+
     def to_s
       "#{path.inspect}"
     end
-
-
+
   end
 end
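Note: Category#path now walks up the parent chain, so nested categories render as a slash-separated path. A small illustrative sketch; the category names are made up:

    require 'category'

    root  = NLP::Category.new('emotions')
    child = NLP::Category.new('anxiety', root)

    child.path   # => "emotions/anxiety"
    child.root   # => :emotions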
data/lib/dictionary.rb
CHANGED
@@ -5,12 +5,24 @@ require 'rid_category'
 require 'liwc_category'

 module NLP
+
   class Dictionary
-
-
+    attr_accessor :tree
+
+
+    def initialize(category_file=:rid,restore = true)
+      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
+      if restore and File.exist?(state_file)
+        d = Dictionary.restore(state_file)
+        @tree = d.tree
+      else
+        @tree = SearchTree.new
+        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
+        store(state_file)
+      end
+
     end
-
+
     def store( state_file )
       File.open( File.expand_path( state_file ), "w" ) do |file|
         Marshal.dump( self, file )
@@ -24,50 +36,40 @@ module NLP
       end
     end

-
-
-    def find( word )
+    def find(word)
       if @exception_pattern && @exception_pattern =~ word
         nil
       else
-        @tree.find(
+        @tree.find(word)
       end
     end

-
+
+    def load_categories(category_file,type)
       category = nil
       primary = nil
       secondary = nil
       tertiary = nil
-
+
+      if type == :rid
+        cat_class = NLP.const_get("RIDCategory")
+      else
+        cat_class = NLP.const_get("LIWCCategory")
+      end
+
       File.open( category_file ) do |file|
         while line = file.gets
           line.chomp!
           begin
             lead, rest = line.scan( /(\t*)(.*)/ ).first
             if lead.size == 0
-
-              category = primary = RIDCategory.new( rest )
-            else
-              category = primary = LIWCCategory.new( rest )
-            end
-
+              category = primary = cat_class.new(rest)
               secondary, tertiary = nil
             elsif lead.size == 1
-
-              category = secondary = RIDCategory.new( rest, primary )
-            else
-              category = secondary = LIWCCategory.new(rest,primary)
-            end
+              category = secondary = cat_class.new(rest, primary )
               tertiary = nil
             elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-
-
-              category = tertiary = RIDCategory.new( rest, secondary )
-            else
-              category = tertiary = LIWCCategory.new( rest, secondary )
-            end
+              category = tertiary = cat_class.new( rest, secondary )
             else
               word = rest.downcase.gsub( /\s*\(1\)$/, '' )
               @tree.insert( word, category )
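Note: Dictionary now owns the cache handling that previously lived in Analyzer. Dictionary.new(:rid) restores a marshalled SearchTree from the DICTIONARY_CACHE_DIR state file (suffixed ".rid") when it exists, and otherwise parses the bundled dict/rid file and writes the cache. A rough sketch of that behaviour, assuming the constants and dict files shipped with the gem; the lookup word is just an example:

    require 'dictionary'

    dict = NLP::Dictionary.new(:rid)          # restores from cache, or parses dict/rid and stores it
    categories = dict.find("kot")             # example word; returns matching categories or nil
    rebuilt = NLP::Dictionary.new(:rid, false) # restore = false forces a rebuild of the cache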
data/lib/emoticon.rb
CHANGED
@@ -1,14 +1,14 @@
 require 'meaningable'
-module NLP
-  class Emoticon < Token
-    include Meaningable
-
-    def initialize(tokens,tags)
-      @orth = tokens.join("")
-      @tags = 'emoticon'
-    end

+module NLP
+  class Emoticon < Token
+    include Meaningable

+    def initialize(tokens,tags)
+      @orth = tokens.join("")
+      @tags = 'emoticon'
     end
+
+  end
 end
 
data/lib/inflectable.rb
CHANGED
@@ -1,61 +1,60 @@
 module Inflectable
-  … (the previous body of Inflectable was removed; its content is not captured in this view)
+
+  GRAM_CAT = {
+    #rzeczownik
+    :adj => 'przymiotnik',
+    [:subst,:depr] => 'rzeczownik',
+    :adv => 'przyslowek',
+    :num => 'liczebnik',
+    [:pron,:siebie] => 'zaimek',
+    :prep => 'przyimek',
+    #liczby
+    :sg => 'liczba_pojedyncza',
+    :pl => 'liczba_mnoga',
+
+    #Przypadki
+    :nom => 'mianownik',
+    :gen => 'dopelniacz',
+    :dat => 'celownik',
+    :acc => 'biernik',
+    :inst => 'narzednik',
+    :loc => 'miejscownik',
+    :voc => 'wolacz',
+
+    #Rodzaje
+    :m1 => 'meski_osobowy',
+    :m2 => 'meski_zwierzecy',
+    :m3 => 'meski_rzeczowy',
+    :f => 'zenski',
+    :n1 => 'nijaki_zbiorowy',
+    :n2 => 'nijaki zwykly',
+    :p1 => 'przymnogi_osobowy',
+    :p2 => 'przymnogi_zwykly',
+    :p3 => 'przymnogi_opisowy',
+
+    #Osoby
+    :pri => "pierwsza_osoba",
+    :sec => "druga_osoba",
+    :ter => "trzecia_osoba",
+
+    #Stopień
+    :pos => "stopien_rowny",
+    :comp => "stopien_wyzszy",
+    :sup => "stopien_najwyzszy"
+  }
+
+  GRAM_CAT.each do |key,value|
+
+    define_method(value+"?"){
+      inflection.split(":").any?{|e|
+        if key.is_a? Array
+          key.any?{|k| e.include? k.to_s}
+        else
+          e.include? key.to_s
+        end
+      }
+    }
+  end
+
+
 end
data/lib/lemmatizer.rb
CHANGED
@@ -3,106 +3,110 @@ require 'rexml/document'
 require 'morfeusz'

 module NLP
-  class Lemmatizer
+  class Lemmatizer
+
     include REXML
-
-    def self.lematize(text,method,input_type)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lematize(str,input_type)
-      #Default lematization method is Morfeusz
-      else
-        morfeusz_lematize(str)
-      end

+    def self.lemmatize(text,method=nil,input_type=nil)
+      if text.is_a? File
+        str = text.read
+        text.close
+      elsif text.is_a? String
+        str = text
+      else
+        raise ArgumentError, "Argument is not String or File"
+      end
+
+      if method === :takipi
+        takipi_lemmatize(str,input_type)
+
+      #Default lematization method is Morfeusz
+      else
+        morfeusz_lemmatize(str)
+      end
     end


-    def self.
-
-      if method === :local
-
-        `takipi -i text.txt -o output.xml -it TXT`
-      end
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
-      end
+    def self.takipi_lemmatize(text,method)
+
+      if method === :local
+
+        xml_file = TAKIPI_XML_FILE
+
+        t1 = Thread.new do
+          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
+        end
+
+        t1.join
+
+        f = File.open(xml_file,"r")
+        doc = Document.new f
+
+      elsif method === :remote
+        xml = TakipiWebService.request(text)
+        doc = Document.new xml
+      else
+        raise ArgumentError, 'Argument is not :local or :remote'
+      end
+
+      parse_lemmatized_xml(doc)
     end


-    def self.
-      … (most of the old morfeusz lematization body is not captured in this view)
-        else
-          Word.new(t,"","")
-        end
-      }
-      temp_text.push sentence
-      end
-      temp_text
-    end
-
-
-    def self.parse_lematized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
-        end
-      end
-    end
+    def self.morfeusz_lemmatize(text)
+      temp_text = Text.new
+
+      #simple tagger
+      #TODO lemmatizer should take TokenScanner object that defines
+      #how split string
+      text.split(/\.|!|\?/).each do |s|
+        sentence = Sentence.new
+        sentence << s.split(" ").collect{ |t|
+          if word = Morfeusz::Lexeme.find(t)
+            if word[0]
+              Word.new(t,word[0].base_form,"")
+            else
+              Word.new(t,"","")
+            end
+          else
+            Word.new(t,"","")
+          end
+        }
+        temp_text << sentence
+      end
+      temp_text
+    end
+
+
+    def self.parse_lemmatized_xml(doc)
+
+      text = Text.new
+
+      doc.elements.each("*/chunkList/chunk") do |chunk|
+        sentence = Sentence.new
+        tokens = []
+
+        chunk.elements.each("tok") do |tok|
+          word = tok.elements[1].text
+          lemat, inflect = ""
+
+          tok.elements.each("lex") do |lex|
+            if lex.has_attributes?
+              lemat = lex.elements[1].text
+              inflect = lex.elements[2].text
+            end
+          end
+
+          tokens << Word.new(word,lemat,inflect)
+        end
+
+        sentence << tokens
+        text << sentence
+      end
+      text
     end

-  end
+  end
 end
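Note: Lemmatizer's public entry point is now lemmatize (the old lematize spelling is gone), with Morfeusz as the default backend and TAKIPI available either as a local binary or through TakipiWebService. A minimal sketch of the three call styles implied by this diff; constants such as TAKIPI_XML_FILE come from the gem itself, and the sample sentence is made up:

    require 'lemmatizer'

    text = "Ala ma kota. Ala lubi koty!"

    # Default backend: Morfeusz, with naive sentence splitting on . ! ?
    morfeusz_text = NLP::Lemmatizer.lemmatize(text)

    # TAKIPI run locally (shells out to the takipi binary and reads TAKIPI_XML_FILE):
    local_text = NLP::Lemmatizer.lemmatize(text, :takipi, :local)

    # TAKIPI via the remote web service:
    remote_text = NLP::Lemmatizer.lemmatize(text, :takipi, :remote)

Each call returns an NLP::Text built from Sentence and Word objects, as assembled by morfeusz_lemmatize and parse_lemmatized_xml above.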