nlp 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/lib/dictionary.rb
DELETED
@@ -1,85 +0,0 @@
-
-require 'stree'
-require 'category'
-require 'rid_category'
-require 'liwc_category'
-
-module NLP
-
-  class Dictionary
-    attr_accessor :tree
-
-
-    def initialize(category_file=:rid,restore = true)
-      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
-      if restore and File.exist?(state_file)
-        d = Dictionary.restore(state_file)
-        @tree = d.tree
-      else
-        @tree = SearchTree.new
-        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
-        store(state_file)
-      end
-
-    end
-
-    def store( state_file )
-      File.open( File.expand_path( state_file ), "w" ) do |file|
-        Marshal.dump( self, file )
-      end
-      self
-    end
-
-    def self.restore( state_file )
-      File.open( File.expand_path( state_file ) ) do |file|
-        Marshal.restore( file )
-      end
-    end
-
-    def find(word)
-      if @exception_pattern && @exception_pattern =~ word
-        nil
-      else
-        @tree.find(word)
-      end
-    end
-
-
-    def load_categories(category_file,type)
-      category = nil
-      primary = nil
-      secondary = nil
-      tertiary = nil
-
-      if type == :rid
-        cat_class = NLP.const_get("RIDCategory")
-      else
-        cat_class = NLP.const_get("LIWCCategory")
-      end
-
-      File.open( category_file ) do |file|
-        while line = file.gets
-          line.chomp!
-          begin
-            lead, rest = line.scan( /(\t*)(.*)/ ).first
-            if lead.size == 0
-              category = primary = cat_class.new(rest)
-              secondary, tertiary = nil
-            elsif lead.size == 1
-              category = secondary = cat_class.new(rest, primary )
-              tertiary = nil
-            elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-              category = tertiary = cat_class.new( rest, secondary )
-            else
-              word = rest.downcase.gsub( /\s*\(1\)$/, '' )
-              @tree.insert( word, category )
-            end
-          rescue
-            raise
-          end
-        end
-      end
-    end
-  end
-end
-
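For orientation, a minimal usage sketch of the removed NLP::Dictionary, inferred only from the deleted code above and its caller in liwc_analyzer.rb; the lookup word is made up, the cache path constant comes from elsewhere in the 0.2.6 gem, and none of this exists in 0.2.7.

# Illustrative only: exercising the 0.2.6 NLP::Dictionary that this release removes.
dict = NLP::Dictionary.new(:liwc)      # restores the marshalled cache or rebuilds it from data/dict/liwc
categories = dict.find('dom')          # 'dom' is a hypothetical lookup; returns the word's categories or nil
categories.each { |c| puts c.root } if categories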
data/lib/emoticon.rb
DELETED
data/lib/inflectable.rb
DELETED
@@ -1,60 +0,0 @@
-module Inflectable
-
-  GRAM_CAT = {
-    # rzeczownik (noun / parts of speech)
-    :adj => 'przymiotnik',
-    [:subst,:depr] => 'rzeczownik',
-    :adv => 'przyslowek',
-    :num => 'liczebnik',
-    [:pron,:siebie] => 'zaimek',
-    :prep => 'przyimek',
-    # liczby (grammatical number)
-    :sg => 'liczba_pojedyncza',
-    :pl => 'liczba_mnoga',
-
-    # Przypadki (cases)
-    :nom => 'mianownik',
-    :gen => 'dopelniacz',
-    :dat => 'celownik',
-    :acc => 'biernik',
-    :inst => 'narzednik',
-    :loc => 'miejscownik',
-    :voc => 'wolacz',
-
-    # Rodzaje (genders)
-    :m1 => 'meski_osobowy',
-    :m2 => 'meski_zwierzecy',
-    :m3 => 'meski_rzeczowy',
-    :f => 'zenski',
-    :n1 => 'nijaki_zbiorowy',
-    :n2 => 'nijaki zwykly',
-    :p1 => 'przymnogi_osobowy',
-    :p2 => 'przymnogi_zwykly',
-    :p3 => 'przymnogi_opisowy',
-
-    # Osoby (persons)
-    :pri => "pierwsza_osoba",
-    :sec => "druga_osoba",
-    :ter => "trzecia_osoba",
-
-    # Stopień (degree)
-    :pos => "stopien_rowny",
-    :comp => "stopien_wyzszy",
-    :sup => "stopien_najwyzszy"
-  }
-
-  GRAM_CAT.each do |key,value|
-
-    define_method(value+"?"){
-      inflection.split(":").any?{|e|
-        if key.is_a? Array
-          key.any?{|k| e.include? k.to_s}
-        else
-          e.include? key.to_s
-        end
-      }
-    }
-  end
-
-
-end
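A brief illustration of how the removed Inflectable mixin generated its predicate methods from GRAM_CAT; the SampleToken class and the "subst:sg:nom:m1" tag string are hypothetical stand-ins, shown here only to make the metaprogramming above concrete.

# Illustration of the removed Inflectable mixin; SampleToken and the tag string are invented.
class SampleToken
  include Inflectable
  attr_reader :inflection
  def initialize(tag)
    @inflection = tag
  end
end

t = SampleToken.new("subst:sg:nom:m1")
t.rzeczownik?   # => true,  :subst maps to 'rzeczownik'
t.mianownik?    # => true,  :nom maps to 'mianownik'
t.celownik?     # => false, no :dat segment in the tag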
data/lib/lemmatizer.rb
DELETED
@@ -1,112 +0,0 @@
-require 'takipi_web_service'
-require 'rexml/document'
-require 'morfeusz'
-
-module NLP
-  class Lemmatizer
-
-    include REXML
-
-    def self.lemmatize(text,method=nil,input_type=nil)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lemmatize(str,input_type)
-
-      # Default lemmatization method is Morfeusz
-      else
-        morfeusz_lemmatize(str)
-      end
-    end
-
-
-
-    def self.takipi_lemmatize(text,method)
-
-      if method === :local
-
-        xml_file = TAKIPI_XML_FILE
-
-        t1 = Thread.new do
-          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
-        end
-
-        t1.join
-
-        f = File.open(xml_file,"r")
-        doc = Document.new f
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
-      end
-
-      parse_lemmatized_xml(doc)
-    end
-
-
-    def self.morfeusz_lemmatize(text)
-      temp_text = Text.new
-
-      # simple tagger
-      # TODO: the lemmatizer should take a TokenScanner object that defines
-      # how to split the string
-      text.split(/\.|!|\?/).each do |s|
-        sentence = Sentence.new
-        sentence << s.split(" ").collect{ |t|
-          if word = Morfeusz::Lexeme.find(t)
-            if word[0]
-              Word.new(t,word[0].base_form,"")
-            else
-              Word.new(t,"","")
-            end
-          else
-            Word.new(t,"","")
-          end
-        }
-        temp_text << sentence
-      end
-      temp_text
-    end
-
-
-    def self.parse_lemmatized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
-        end
-
-        sentence << tokens
-        text << sentence
-      end
-      text
-    end
-
-
-  end
-end
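Likewise, a hedged sketch of how the removed NLP::Lemmatizer entry point was invoked, inferred from the deleted lemmatize signature above; the sample sentence is illustrative and this class is gone in 0.2.7.

# Illustrative only: the two code paths exposed by the removed NLP::Lemmatizer.
morf = NLP::Lemmatizer.lemmatize("Ala ma kota.")                    # default path: local Morfeusz lexicon
taki = NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :remote)  # TaKIPI, via its web service
# Both return an NLP::Text: Sentence objects holding Word tokens (surface form, lemma, inflection tag).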
data/lib/liwc_analyzer.rb
DELETED
@@ -1,74 +0,0 @@
-module NLP
-  class LIWCAnalyzer < Analyzer
-
-    def initialize(dicts)
-      @dictionary = Dictionary.new(:liwc)
-    end
-
-
-    def analyze(scanner)
-
-      results = Statistic.new
-      results.hash = {
-        :long_words => [],
-        :zaimki => [],
-        :zaimki1 => [],
-        :zaimki2 => [],
-        :zaimki3 => [],
-        :przyimki => [],
-        :numbers => [],
-        :emotion => [],
-        :social => [],
-        :personal => [],
-        :posemotion => [],
-        :negemotion => [],
-        :wulgar => [],
-        :cognitive => []
-      }
-
-      while token = scanner.current
-        word = token.lemat
-
-        categories = @dictionary.find(word.gsub( /[^\w-]/, "" ))
-        unless categories.nil?
-          categories.each do |category|
-            puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
-            token.category = category
-            results.add(word,category)
-
-
-            results[:emotion].push token.orth if token.emotion?
-            results[:social].push token.orth if token.social?
-            results[:personal].push token.orth if token.personal?
-            results[:wulgar].push token.orth if token.bad_word?
-            results[:cognitive].push token.orth if token.cognitive?
-
-            results[:posemotion].push token.orth if token.positive_emotion?
-            results[:negemotion].push token.orth if token.negative_emotion?
-          end
-        end
-
-        #words longer than 10
-        results[:long_words].push word if word.jlength > 10
-        if token.zaimek?
-          results[:zaimki].push word
-
-          results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
-          results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
-          results[:zaimki3].push token.orth if word === 'on'
-        end
-
-        results[:przyimki].push word if token.przyimek?
-        results[:numbers].push token.orth if token.number? or token.liczebnik?
-
-
-        results.total_words += 1
-        scanner.next(:alphanum)
-      end
-      results
-
-    end
-
-  end
-
-end
data/lib/liwc_category.rb
DELETED
@@ -1,61 +0,0 @@
-module NLP
-
-  class LIWCCategory < Category
-
-    #primary categories
-
-    def linguistic?
-      root == :PIERWOTNE
-    end
-
-    def psychological?
-      root == :PROCESY_PSYCHOLOGICZNE
-    end
-
-
-    def relative?
-      root === :RELATYWNOSC
-    end
-
-    def personal?
-      root == :OSOBISTE
-    end
-
-    #second categories
-
-    def emotion?
-      path.include? 'EMOCJE'
-
-    end
-
-    def positive_emotion?
-      path.include? 'POZYTYWNE_EMOCJE'
-
-    end
-
-    def negative_emotion?
-      path.include? 'NEGATYWNE_EMOCJE'
-
-    end
-
-    def cognitive?
-      path.include? 'KOGNITYWNE_PROCESY'
-
-    end
-
-    def sense?
-      path.include? 'ZMYSLY'
-    end
-
-    def social?
-      path.include? 'SOCIAL'
-
-    end
-
-    def bad_word?
-      path.include? 'WULGAR'
-    end
-
-
-  end
-end
data/lib/meaningable.rb
DELETED
@@ -1,69 +0,0 @@
-module Meaningable
-
-  #LIWC
-  #primary categories
-
-  def linguistic?
-    category.root == :PIERWOTNE
-  end
-
-  def psychological?
-    category.root == :PROCESY_PSYCHOLOGICZNE
-  end
-
-
-  def relative?
-    category.root === :RELATYWNOSC
-  end
-
-  def personal?
-    category.root == :OSOBISTE
-  end
-
-  #second categories
-
-  def emotion?
-    category.path.include? 'EMOCJE'
-
-  end
-
-  def positive_emotion?
-    category.path.include? 'POZYTYWNE_EMOCJE'
-
-  end
-
-  def negative_emotion?
-    category.path.include? 'NEGATYWNE_EMOCJE'
-
-  end
-
-  def cognitive?
-    category.path.include? 'KOGNITYWNE_PROCESY'
-
-  end
-
-  def sense?
-    category.path.include? 'ZMYSLY'
-  end
-
-  def social?
-    category.path.include? 'SOCIAL'
-
-  end
-
-  def bad_word?
-    category.path.include? 'WULGAR'
-  end
-
-
-  #SEMANTIC
-  def synonym?(other)
-
-  end
-
-  def synonyms
-
-  end
-
-
-end