nlp 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/analyzer.rb
CHANGED
@@ -9,72 +9,40 @@ require 'sentence'
 require "token_scanner.rb"
 require "lemmatizer"
 require 'jcode'
+require 'statistic'
 $KODE = "UTF8"

 module NLP

   class Analyzer

-
-
-    def initialize( category_file, restore = true )
-      state_file = File.expand_path(Analyzer::CACHE_DIR)
-      if restore
-        @dictionary = Dictionary.restore(state_file)
-      else
-        @dictionary = Dictionary.new
-        @dictionary.load_categories( category_file, :rid => true )
-        @dictionary.store(state_file)
-      end
-
+    def initialize(dict)
+      @dictionary = Dictionary.new(dict)
     end
-
-
-    def analyze( scanner)
-
-      results = {
-        :word_count => 0,
-        :word_total => 0,
-        :scores => Hash.new { 0 },
-        :words => []
-      }


+    def analyze(scanner)
+
+      results = Statistic.new

-
+      while token = scanner.current
       word = token.lemat

-      categories = @dictionary.find(
+        categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
       unless categories.nil?
-
-
-
-
-        results[:words].push word
-      end
-
-
+          categories.each do |category|
+            puts "Znalazłem słowo #{word} : #{category}"
+            results.add(word,category)
+          end
       end

-      results
+        results.total_words += 1
       scanner.next(:word)
-
-
-      results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
-      primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
-      secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
-      emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
-
+      end

-      results[:classes] = {
-        :primary => Float(primary_sum) / results[:word_count],
-        :secondary => Float(secondary_sum) / results[:word_count],
-        :emotions => Float(emotion_sum) / results[:word_count]
-      }
-
       results
-    end

+    end
   end
 end

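The net effect for callers: Analyzer.new now takes a dictionary identifier instead of a category file path plus a restore flag, and analyze returns the new Statistic object rather than a hand-built hash. A minimal usage sketch under those assumptions; the :rid symbol follows dictionary.rb below, while the TokenScanner construction is hypothetical, since its new signature is not shown in this section:

    require 'nlp'

    analyzer = NLP::Analyzer.new(:rid)    # wraps Dictionary.new(:rid) internally

    # Hypothetical scanner setup; analyze only relies on #current and #next(:word).
    scanner = NLP::TokenScanner.new("jakis polski tekst")
    stats   = analyzer.analyze(scanner)   # a Statistic, not a Hash
    stats.total_words                     # incremented once per scanned token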
data/lib/category.rb
CHANGED
@@ -1,16 +1,16 @@
 module NLP
   class Category
     attr_reader :parent, :name
-
-    def initialize(
+
+    def initialize(name, parent = nil)
       @parent = parent
       @name = name.to_sym
     end
-
+
     def path
-      @parent ? (
+      @parent ? (@parent.path + '/' + name.to_s) : name.to_s
     end
-
+
     def root
       category = self
       while category.parent != nil
@@ -18,11 +18,10 @@ module NLP
       end
       category.name
     end
-
+
     def to_s
       "#{path.inspect}"
     end
-
-
+
   end
 end
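With the argument list and body restored above, initialize stores the name as a symbol, path joins names with slashes up the parent chain, and root returns the top-most name. A small illustration (the category names are invented for the example):

    primary   = NLP::Category.new('EMOTIONS')
    secondary = NLP::Category.new('ANXIETY', primary)

    secondary.path   # => "EMOTIONS/ANXIETY"
    secondary.root   # => :EMOTIONS  (names are stored with to_sym)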
data/lib/dictionary.rb
CHANGED
@@ -5,12 +5,24 @@ require 'rid_category'
 require 'liwc_category'

 module NLP
+
   class Dictionary
-
-
-
+    attr_accessor :tree
+
+
+    def initialize(category_file=:rid,restore = true)
+      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
+      if restore and File.exist?(state_file)
+        d = Dictionary.restore(state_file)
+        @tree = d.tree
+      else
+        @tree = SearchTree.new
+        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
+        store(state_file)
+      end
+
     end
-
+
     def store( state_file )
       File.open( File.expand_path( state_file ), "w" ) do |file|
         Marshal.dump( self, file )
@@ -24,50 +36,40 @@ module NLP
       end
     end

-
-
-    def find( word )
+    def find(word)
       if @exception_pattern && @exception_pattern =~ word
         nil
       else
-        @tree.find(
+        @tree.find(word)
       end
     end
-

-
+
+    def load_categories(category_file,type)
       category = nil
       primary = nil
       secondary = nil
       tertiary = nil
-
+
+      if type == :rid
+        cat_class = NLP.const_get("RIDCategory")
+      else
+        cat_class = NLP.const_get("LIWCCategory")
+      end
+
       File.open( category_file ) do |file|
         while line = file.gets
           line.chomp!
           begin
             lead, rest = line.scan( /(\t*)(.*)/ ).first
             if lead.size == 0
-
-              category = primary = RIDCategory.new( rest )
-            else
-              category = primary = LIWCCategory.new( rest )
-            end
-
+              category = primary = cat_class.new(rest)
               secondary, tertiary = nil
             elsif lead.size == 1
-
-              category = secondary = RIDCategory.new( rest, primary )
-            else
-              category = secondary = LIWCCategory.new(rest,primary)
-            end
+              category = secondary = cat_class.new(rest, primary )
               tertiary = nil
             elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-
-
-              category = tertiary = RIDCategory.new( rest, secondary )
-            else
-              category = tertiary = LIWCCategory.new( rest, secondary )
-            end
+              category = tertiary = cat_class.new( rest, secondary )
             else
               word = rest.downcase.gsub( /\s*\(1\)$/, '' )
               @tree.insert( word, category )
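The constructor now owns the whole cache round trip: the first run parses the bundled dict/<name> file into a SearchTree and marshals it to DICTIONARY_CACHE_DIR.<name>; later runs restore the tree from that file. A usage sketch, assuming DICTIONARY_CACHE_DIR is defined elsewhere in the gem (nlp.rb also changed in this release, but its contents are not shown here):

    dict = NLP::Dictionary.new(:rid)         # parse dict/rid once, cache via Marshal
    dict = NLP::Dictionary.new(:rid, false)  # ignore any cache and re-parse

    dict.find("word")  # whatever the tree stores for the word, or nil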
data/lib/emoticon.rb
CHANGED
@@ -1,14 +1,14 @@
 require 'meaningable'
-module NLP
-  class Emoticon < Token
-    include Meaningable
-
-    def initialize(tokens,tags)
-      @orth = tokens.join("")
-      @tags = 'emoticon'
-    end

+module NLP
+  class Emoticon < Token
+    include Meaningable

+    def initialize(tokens,tags)
+      @orth = tokens.join("")
+      @tags = 'emoticon'
     end
+
+  end
 end

data/lib/inflectable.rb
CHANGED
@@ -1,61 +1,60 @@
 module Inflectable
-[old lines 2-60 are blank in the source view; their content was not captured]
+
+  GRAM_CAT = {
+    #rzeczownik
+    :adj => 'przymiotnik',
+    [:subst,:depr] => 'rzeczownik',
+    :adv => 'przyslowek',
+    :num => 'liczebnik',
+    [:pron,:siebie] => 'zaimek',
+    :prep => 'przyimek',
+    #liczby
+    :sg => 'liczba_pojedyncza',
+    :pl => 'liczba_mnoga',
+
+    #Przypadki
+    :nom => 'mianownik',
+    :gen => 'dopelniacz',
+    :dat => 'celownik',
+    :acc => 'biernik',
+    :inst => 'narzednik',
+    :loc => 'miejscownik',
+    :voc => 'wolacz',
+
+    #Rodzaje
+    :m1 => 'meski_osobowy',
+    :m2 => 'meski_zwierzecy',
+    :m3 => 'meski_rzeczowy',
+    :f => 'zenski',
+    :n1 => 'nijaki_zbiorowy',
+    :n2 => 'nijaki zwykly',
+    :p1 => 'przymnogi_osobowy',
+    :p2 => 'przymnogi_zwykly',
+    :p3 => 'przymnogi_opisowy',
+
+    #Osoby
+    :pri => "pierwsza_osoba",
+    :sec => "druga_osoba",
+    :ter => "trzecia_osoba",
+
+    #Stopień
+    :pos => "stopien_rowny",
+    :comp => "stopien_wyzszy",
+    :sup => "stopien_najwyzszy"
+  }
+
+  GRAM_CAT.each do |key,value|
+
+    define_method(value+"?"){
+      inflection.split(":").any?{|e|
+        if key.is_a? Array
+          key.any?{|k| e.include? k.to_s}
+        else
+          e.include? key.to_s
+        end
+      }
+    }
+  end
+
+
 end
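The GRAM_CAT table drives the metaprogramming at the bottom: each value becomes a predicate method that tests the colon-separated segments of the including object's inflection string. Note the substring match via include?, so a one-letter key such as :f can also hit unrelated tags that merely contain that letter. A sketch with a hypothetical including class:

    class DemoToken
      include Inflectable
      attr_reader :inflection

      def initialize(inflection)
        @inflection = inflection   # a Morfeusz-style tag string
      end
    end

    t = DemoToken.new("subst:sg:nom:m3")
    t.rzeczownik?    # => true  ("subst" matches the [:subst,:depr] key)
    t.mianownik?     # => true  ("nom")
    t.liczba_mnoga?  # => false (no "pl" segment)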
data/lib/lemmatizer.rb
CHANGED
@@ -3,106 +3,110 @@ require 'rexml/document'
 require 'morfeusz'

 module NLP
- class Lemmatizer
+  class Lemmatizer
+
     include REXML
-
-    def self.lematize(text,method,input_type)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lematize(str,input_type)
-      #Default lematization method is Morfeusz
-      else
-        morfeusz_lematize(str)
-      end

+    def self.lemmatize(text,method=nil,input_type=nil)
+      if text.is_a? File
+        str = text.read
+        text.close
+      elsif text.is_a? String
+        str = text
+      else
+        raise ArgumentError, "Argument is not String or File"
+      end
+
+      if method === :takipi
+        takipi_lemmatize(str,input_type)
+
+      #Default lematization method is Morfeusz
+      else
+        morfeusz_lemmatize(str)
+      end
     end



-    def self.
-
-    if method === :local
+    def self.takipi_lemmatize(text,method)

-
-      `takipi -i text.txt -o output.xml -it TXT`
-      end
+      if method === :local

-
+        xml_file = TAKIPI_XML_FILE

-
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
+        t1 = Thread.new do
+          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
       end

-
+        t1.join
+
+        f = File.open(xml_file,"r")
+        doc = Document.new f
+
+      elsif method === :remote
+        xml = TakipiWebService.request(text)
+        doc = Document.new xml
+      else
+        raise ArgumentError, 'Argument is not :local or :remote'
+      end
+
+      parse_lemmatized_xml(doc)
     end


-    def self.
-[old lines 54-66 are blank in the source view; their content was not captured]
-      end
-      else
-        Word.new(t,"","")
-      end
-    }
-    temp_text.push sentence
-    end
-    temp_text
-    end
-
-
-    def self.parse_lematized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
+    def self.morfeusz_lemmatize(text)
+      temp_text = Text.new
+
+      #simple tagger
+      #TODO lemmatizer should take TokenScanner object that defines
+      #how split string
+      text.split(/\.|!|\?/).each do |s|
+        sentence = Sentence.new
+        sentence << s.split(" ").collect{ |t|
+          if word = Morfeusz::Lexeme.find(t)
+            if word[0]
+              Word.new(t,word[0].base_form,"")
+            else
+              Word.new(t,"","")
          end
+          else
+            Word.new(t,"","")
+          end
+        }
+        temp_text << sentence
+      end
+      temp_text
+    end
+

-
-
+    def self.parse_lemmatized_xml(doc)
+
+      text = Text.new
+
+      doc.elements.each("*/chunkList/chunk") do |chunk|
+        sentence = Sentence.new
+        tokens = []
+
+        chunk.elements.each("tok") do |tok|
+          word = tok.elements[1].text
+          lemat, inflect = ""
+
+          tok.elements.each("lex") do |lex|
+            if lex.has_attributes?
+              lemat = lex.elements[1].text
+              inflect = lex.elements[2].text
+            end
+          end
+
+          tokens << Word.new(word,lemat,inflect)
         end
-
+
+        sentence << tokens
+        text << sentence
+      end
+      text
     end


- end
+  end
 end
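With the lematize/lemmatize renames in place, lemmatize is the single public entry point and Morfeusz is the default backend (method and input_type now default to nil). A usage sketch; the default path needs the Morfeusz bindings required at the top of the file, and the :takipi variants need a local takipi binary or a reachable TakipiWebService:

    # Default: Morfeusz, splitting sentences on . ! ? and tokens on spaces.
    text = NLP::Lemmatizer.lemmatize("Ala ma kota. Ala ma psa.")

    # TAKIPI: both variants funnel the tagger's XML through parse_lemmatized_xml.
    NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :local)
    NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :remote)

Each call returns a Text of Sentence objects whose tokens are Word instances built as Word.new(orth, lemat, inflect).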