nlp 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analizators/analyzer.rb +28 -0
- data/lib/analizators/liwc_analyzer.rb +68 -0
- data/lib/analizators/rid_analyzer.rb +10 -0
- data/lib/dictionaries/category.rb +27 -0
- data/lib/dictionaries/dictionary.rb +76 -0
- data/lib/dictionaries/liwc_category.rb +54 -0
- data/lib/dictionaries/pl_trie.rb +31 -0
- data/lib/dictionaries/rid_category.rb +21 -0
- data/lib/nlp.rb +0 -1
- data/lib/tagger/emoticon.rb +13 -0
- data/lib/tagger/inflectable.rb +59 -0
- data/lib/tagger/lemmatizer.rb +112 -0
- data/lib/tagger/meaningable.rb +63 -0
- data/lib/tagger/sentence.rb +24 -0
- data/lib/tagger/takipi_web_service.rb +51 -0
- data/lib/tagger/text.rb +24 -0
- data/lib/tagger/token.rb +45 -0
- data/lib/tagger/token_scanner.rb +58 -0
- data/lib/tagger/word.rb +20 -0
- metadata +21 -4
- data/lib/morfeusz.rb +0 -69
@@ -0,0 +1,28 @@
|
|
1
|
+
module NLP
  # Base dictionary-driven analyzer: walks a token stream and collects
  # per-word category statistics into a TextStatistics object.
  class Analyzer

    # dict - dictionary identifier, forwarded verbatim to Dictionary.new
    #        (presumably :rid or :liwc -- confirm against Dictionary).
    def initialize(dict)
      @dictionary = Dictionary.new(dict)
    end

    # Consumes every word token from +scanner+ (an object exposing
    # #current and #next) and returns the accumulated TextStatistics.
    def analyze(scanner)
      stats = TextStatistics.new

      until (token = scanner.current).nil?
        lemma = token.lemat
        found = @dictionary.find(lemma)
        stats.add(lemma, found) unless found.nil?
        stats.total_words += 1
        scanner.next(:word)
      end

      stats
    end
  end
end
|
28
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module NLP
  # LIWC analyzer: buckets each scanned token into linguistic and
  # psychological category lists on the resulting TextStatistics.
  class LIWCAnalyzer < Analyzer

    def initialize
      @dictionary = Dictionary.new(:liwc)
    end

    # Scans words and numbers from +scanner+, filling the category
    # buckets below and counting total words.
    def analyze(scanner)
      stats = TextStatistics.new
      stats.hash = %i[
        long_words zaimki zaimki1 zaimki2 zaimki3 przyimki numbers
        emotion social personal posemotion negemotion wulgar cognitive
      ].each_with_object({}) { |key, acc| acc[key] = [] }

      until (token = scanner.current).nil?
        lemma = token.lemat
        found = @dictionary.find(lemma.gsub(/[^\w-]/, ""))

        unless found.nil?
          stats.add(lemma, found)
          token.category = found.first

          stats[:emotion]    << token.orth if token.emotion?
          stats[:social]     << token.orth if token.social?
          stats[:personal]   << token.orth if token.personal?
          stats[:wulgar]     << token.orth if token.bad_word?
          stats[:cognitive]  << token.orth if token.cognitive?
          stats[:posemotion] << token.orth if token.positive_emotion?
          stats[:negemotion] << token.orth if token.negative_emotion?
        end

        # Words longer than 10 characters. NOTE(review): String#jlength
        # is a Ruby 1.8 jcode extension; on modern Rubies this raises
        # NoMethodError -- kept to preserve the original behaviour.
        stats[:long_words] << lemma if lemma.jlength > 10

        if token.zaimek?
          stats[:zaimki] << lemma
          stats[:zaimki1] << token.orth if lemma == 'ja' || lemma == 'my'
          stats[:zaimki2] << token.orth if lemma == 'ty' || lemma == 'wy'
          stats[:zaimki3] << token.orth if lemma == 'on'
        end

        stats[:przyimki] << lemma if token.przyimek?
        stats[:numbers]  << token.orth if token.number? || token.liczebnik?

        stats.total_words += 1
        scanner.next(:alphanum)
      end
      stats
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module NLP
  # A node in a category taxonomy tree (used by the RID and LIWC
  # dictionaries). Each category knows its symbolic name and optional
  # parent category.
  class Category
    attr_reader :parent, :name

    # name   - category label; coerced to a Symbol
    # parent - enclosing Category, or nil for a top-level category
    def initialize(name, parent = nil)
      @parent = parent
      @name = name.to_sym
    end

    # Slash-separated path from the root down to this category,
    # e.g. "EMOCJE/POZYTYWNE_EMOCJE".
    def path
      @parent ? "#{@parent.path}/#{name}" : name.to_s
    end

    # Symbol name of the top-most ancestor (self when there is no parent).
    def root
      category = self
      category = category.parent while category.parent
      category.name
    end

    # Quoted rendering of the full path. The original wrapped
    # path.inspect in a redundant string interpolation; output is
    # unchanged.
    def to_s
      path.inspect
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module NLP
  # Word -> category lookup backed by a PlTrie, built from the bundled
  # dictionary files and cached to disk with Marshal.
  class Dictionary

    attr_accessor :tree

    # category_file - name of the bundled dictionary (:rid or :liwc)
    # restore       - when true, reuse the marshalled on-disk cache if
    #                 it exists instead of re-parsing the source file
    def initialize(category_file = :rid, restore = true)
      state_file = File.expand_path(DICTIONARY_CACHE_DIR + ".#{category_file}")
      if restore && File.exist?(state_file)
        @tree = Dictionary.restore(state_file)
      else
        @tree = PlTrie.new
        load_categories(File.dirname(__FILE__) + "/../../dict/#{category_file}", category_file)
        store(state_file)
      end
    end

    # Marshals the trie to +state_file+. Returns self.
    def store(state_file)
      File.open(File.expand_path(state_file), "w") do |file|
        Marshal.dump(tree, file)
      end
      self
    end

    # Loads a previously stored trie from +state_file+.
    def self.restore(state_file)
      File.open(File.expand_path(state_file)) do |file|
        Marshal.restore(file)
      end
    end

    # Looks a word up in the trie. Lookup errors (e.g. characters
    # outside the trie alphabet) are deliberately treated as a miss.
    def find(word)
      @tree.find(word)
    rescue StandardError
      nil
    end

    # Parses an indented category file: tab depth encodes the category
    # hierarchy, other lines are dictionary words filed under the most
    # recently seen category.
    def load_categories(category_file, type)
      category  = nil
      primary   = nil
      secondary = nil

      # Direct constant references; the original's NLP.const_get
      # reflection bought nothing.
      cat_class = type == :rid ? RIDCategory : LIWCCategory

      File.open(category_file) do |file|
        while line = file.gets
          line.chomp!
          lead, rest = line.scan(/(\t*)(.*)/).first
          if lead.size == 0
            category = primary = cat_class.new(rest)
            secondary = nil
          elsif lead.size == 1
            category = secondary = cat_class.new(rest, primary)
          elsif lead.size == 2 && (cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
            # Third-level category: all-caps name at depth two.
            category = cat_class.new(rest, secondary)
          else
            # Dictionary word; strip an optional "(1)" sense marker.
            word = rest.downcase.gsub(/\s*\(1\)$/, '')
            @tree.insert(word, category)
          end
        end
      end
    end
  end
end
|
76
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module NLP
  # LIWC category taxonomy. Top-level membership is tested against the
  # root symbol; second-level membership by substring of the full path.
  class LIWCCategory < Category

    # -- primary (top-level) categories --

    def linguistic?
      root == :PIERWOTNE
    end

    def psychological?
      root == :PROCESY_PSYCHOLOGICZNE
    end

    def relative?
      # Was `===`; `==` matches the sibling predicates (identical
      # behaviour for Symbols).
      root == :RELATYWNOSC
    end

    def personal?
      root == :OSOBISTE
    end

    # -- second-level categories --

    def emotion?
      path.include? 'EMOCJE'
    end

    def positive_emotion?
      path.include? 'POZYTYWNE_EMOCJE'
    end

    def negative_emotion?
      path.include? 'NEGATYWNE_EMOCJE'
    end

    def cognitive?
      path.include? 'KOGNITYWNE_PROCESY'
    end

    def sense?
      path.include? 'ZMYSLY'
    end

    def social?
      path.include? 'SOCIAL'
    end

    def bad_word?
      path.include? 'WULGAR'
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'ds'

module NLP

  include DS

  # Trie specialised for Polish text: the alphabet covers Polish
  # diacritics plus dash and space.
  class PlTrie < Trie

    ALPHABET = %w{- a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u v w x y z ź ż} << ' '

    # Recursive insert: walks (creating as needed) one child node per
    # character of +s+ and appends +value+ to the terminal node's data.
    def priv_insert(s, value)
      if s.empty?
        (@data ||= []) << value
      else
        child = (@children[key(s.first)] ||= PlTrie.new)
        child.priv_insert(s[1..-1], value)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module NLP
  # Category taxonomy of the Regressive Imagery Dictionary.
  class RIDCategory < Category

    # The three RID top-level categories.
    def self.top_level
      %i[PIERWOTNE WTORNE EMOCJE].map { |name| new(name) }
    end

    # Primary-process imagery.
    def primary?
      root == :PIERWOTNE
    end

    # Secondary-process imagery.
    def secondary?
      root == :WTORNE
    end

    # Emotion categories.
    def emotions?
      root == :EMOCJE
    end
  end
end
|
data/lib/nlp.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Mixin generating grammatical predicate methods from tagger tags.
# The including class must expose #inflection, a colon-separated tag
# string (e.g. "subst:sg:nom:m1"); each GRAM_CAT entry defines a
# predicate named after the Polish category, e.g. #liczba_pojedyncza?.
module Inflectable

  GRAM_CAT = {
    # parts of speech
    :adj => 'przymiotnik',
    [:subst,:depr] => 'rzeczownik',
    :adv => 'przyslowek',
    :num => 'liczebnik',
    [:pron,:siebie] => 'zaimek',
    :prep => 'przyimek',

    # number
    :sg => 'liczba_pojedyncza',
    :pl => 'liczba_mnoga',

    # cases
    :nom => 'mianownik',
    :gen => 'dopelniacz',
    :dat => 'celownik',
    :acc => 'biernik',
    :inst => 'narzednik',
    :loc => 'miejscownik',
    :voc => 'wolacz',

    # genders
    :m1 => 'meski_osobowy',
    :m2 => 'meski_zwierzecy',
    :m3 => 'meski_rzeczowy',
    :f => 'zenski',
    :n1 => 'nijaki_zbiorowy',
    :n2 => 'nijaki zwykly',
    :p1 => 'przymnogi_osobowy',
    :p2 => 'przymnogi_zwykly',
    :p3 => 'przymnogi_opisowy',

    # persons
    :pri => "pierwsza_osoba",
    :sec => "druga_osoba",
    :ter => "trzecia_osoba",

    # degree
    :pos => "stopien_rowny",
    :comp => "stopien_wyzszy",
    :sup => "stopien_najwyzszy"
  }

  GRAM_CAT.each do |tags, category_name|
    atoms = Array(tags).map(&:to_s)

    # True when any colon-separated segment of the inflection string
    # contains one of this category's tag atoms (substring match, as in
    # the original implementation).
    define_method("#{category_name}?") do
      inflection.split(":").any? do |segment|
        atoms.any? { |atom| segment.include?(atom) }
      end
    end
  end

end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'rexml/document'

module NLP
  # Turns raw text into a tagged NLP::Text structure using the TaKIPI
  # tagger, either via the local binary or the remote web service.
  class Lemmatizer

    include REXML

    # text       - String, or an open File (read and closed here)
    # method     - :takipi to use TaKIPI explicitly; any other value
    #              currently also falls through to remote TaKIPI
    # input_type - forwarded to takipi_lemmatize (:local or :remote)
    #
    # Raises ArgumentError when +text+ is neither String nor File.
    def self.lemmatize(text, method = nil, input_type = nil)
      if text.is_a? File
        str = text.read
        text.close
      elsif text.is_a? String
        str = text
      else
        raise ArgumentError, "Argument is not String or File"
      end

      if method == :takipi
        takipi_lemmatize(str, input_type)
      else
        # Default lemmatization method was Morfeusz; until its bindings
        # are restored, fall back to the remote TaKIPI service.
        takipi_lemmatize(str, :remote)
        # morfeusz_lemmatize(str)
      end
    end

    # Runs TaKIPI locally (shelling out to the `takipi` binary) or via
    # the web service, then parses the resulting XML.
    #
    # Raises ArgumentError for an unknown +method+.
    def self.takipi_lemmatize(text, method)
      if method == :local
        xml_file = TAKIPI_XML_FILE

        # SECURITY: +text+ is interpolated into a shell command -- a
        # single quote in the input escapes the echo. Should be replaced
        # with IO.write plus system() with an argument list.
        t1 = Thread.new do
          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
        end
        t1.join

        f = File.open(xml_file, "r")
        doc = Document.new f
      elsif method == :remote
        xml = TakipiWebService.request(text)
        doc = Document.new xml
      else
        raise ArgumentError, 'Argument is not :local or :remote'
      end

      parse_lemmatized_xml(doc)
    end

    # Placeholder for the retired Morfeusz bindings (see deleted
    # lib/morfeusz.rb); currently returns an empty Text.
    def self.morfeusz_lemmatize(text)
      temp_text = Text.new

      # TODO: lemmatizer should take a TokenScanner object that defines
      # how to split the string.
      # text.split(/\.|!|\?/).each do |s|
      #   sentence = Sentence.new
      #   sentence << s.split(" ").collect{ |t|
      #     if word = Morfeusz::Lexeme.find(t)
      #       if word[0]
      #         Word.new(t,word[0].base_form,"")
      #       else
      #         Word.new(t,"","")
      #       end
      #     else
      #       Word.new(t,"","")
      #     end
      #   }
      #   temp_text << sentence
      # end
      temp_text
    end

    # Builds an NLP::Text from TaKIPI's XML: each <chunk> becomes a
    # Sentence, each <tok> a Word (surface form, base form, tag string).
    def self.parse_lemmatized_xml(doc)
      text = Text.new

      doc.elements.each("*/chunkList/chunk") do |chunk|
        sentence = Sentence.new
        tokens = []

        chunk.elements.each("tok") do |tok|
          word = tok.elements[1].text
          # BUGFIX: the original `lemat, inflect = ""` is parallel
          # assignment -- it set lemat to "" but inflect to nil, so a
          # token without a disambiguated <lex> got a nil tag string.
          lemat = inflect = ""

          tok.elements.each("lex") do |lex|
            # Only the disambiguated <lex> (the one carrying attributes)
            # supplies the base form and tag.
            if lex.has_attributes?
              lemat = lex.elements[1].text
              inflect = lex.elements[2].text
            end
          end

          tokens << Word.new(word, lemat, inflect)
        end

        sentence << tokens
        text << sentence
      end
      text
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Mixin for tokens that carry a LIWC category. Predicates inspect the
# category's root symbol or its path string; the including class must
# expose #category (an object responding to #root and #path).
module Meaningable

  # -- LIWC primary (top-level) categories --

  def linguistic?
    category.root == :PIERWOTNE
  end

  def psychological?
    category.root == :PROCESY_PSYCHOLOGICZNE
  end

  def relative?
    category.root == :RELATYWNOSC
  end

  def personal?
    category.root == :OSOBISTE
  end

  # -- LIWC second-level categories --

  def emotion?
    category.path.include? 'EMOCJE'
  end

  def positive_emotion?
    category.path.include? 'POZYTYWNE_EMOCJE'
  end

  def negative_emotion?
    category.path.include? 'NEGATYWNE_EMOCJE'
  end

  def cognitive?
    category.path.include? 'KOGNITYWNE_PROCESY'
  end

  def sense?
    category.path.include? 'ZMYSLY'
  end

  def social?
    category.path.include? 'SOCIAL'
  end

  def bad_word?
    category.path.include? 'WULGAR'
  end

  # -- semantic relations (not yet implemented) --

  def synonym?(other)
  end

  def synonyms
  end

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # An ordered list of tokens forming a single sentence.
  class Sentence

    attr_reader :tokens

    def initialize
      @tokens = []
    end

    # Appends a single token or an array of tokens. Returns self so
    # appends can be chained.
    def <<(new_tokens)
      case new_tokens
      when Array then @tokens.concat(new_tokens)
      else @tokens << new_tokens
      end
      self
    end

    # Number of non-punctuation tokens in the sentence.
    def words_number
      @tokens.reject(&:interp?).size
    end

  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
require 'savon'

# SOAP client for the CLARIN TaKIPI tagging web service: submits text,
# polls until the job completes, then fetches the tagged chunk list.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Tags +text+ remotely and returns the result wrapped as a
  # well-formed XML string: "<xml><chunkList>...</chunkList></xml>".
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Submit the tagging job.
    response = client.tag do |soap|
      soap.body = "<text>#{text}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end

    response = response.to_hash
    token = response[:tag_response][:tag_response][:msg]
    status = (response[:tag_response][:tag_response][:status]).to_i

    # Poll while the job is queued (2) or running (3); 1 means done.
    timeout = 60
    step = 5
    count = 0
    loop do
      break if count > timeout
      if status == 1
        break
      elsif status == 2 or status == 3
        count += step   # was a magic 5; use the declared step size
        sleep(1)
        r = client.get_status do |soap|
          soap.body = "<token>#{token}</token>"
        end.to_hash
        status = (r[:get_status_response][:status]).to_i
      else
        # BUGFIX: an unknown/error status previously spun this loop
        # forever without advancing `count`; bail out instead.
        break
      end
    end

    # Fetch whatever result is available for the token.
    result = client.get_result do |soap|
      soap.body = "<token>#{token}</token>"
    end

    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # Wrap in a root element so the response parses as one XML document.
    return "<xml><chunkList>#{response_document}</chunkList></xml>"
  end
end
|
51
|
+
|
data/lib/tagger/text.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # An ordered collection of sentences making up a document.
  class Text
    attr_reader :sentences

    def initialize
      @sentences = []
    end

    # Appends one sentence to the document.
    def <<(sentence)
      @sentences.push sentence
    end

    # Mean sentence length in words. NOTE: Array#mean comes from the
    # gem's own core extension (lib/stdlib/ext/array.rb).
    def words_per_sentence
      @sentences.map(&:words_number).mean
    end

    # All tokens of all sentences, flattened into one array in
    # document order.
    def flatten
      @sentences.flat_map(&:tokens)
    end

  end
end
|
data/lib/tagger/token.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module NLP
  # A single lexical token: the surface form (+orth+) together with its
  # morphosyntactic tag string (+tags+) as emitted by the tagger.
  class Token

    attr_reader :orth, :tags

    def initialize(orth, tags)
      @orth = orth
      @tags = tags
    end

    # Symbol token (tag is exactly "tsym").
    def symbol?
      @tags == "tsym"
    end

    # Punctuation token (tag is exactly "interp").
    def interp?
      @tags == "interp"
    end

    # A word is anything that is neither punctuation, a number, nor an
    # agglutination marker.
    def word?
      !(interp? || number? || agl?)
    end

    # Any numeric token.
    def number?
      @tags.include?("tnum")
    end

    def integer?
      @tags.include?("tnum:integer")
    end

    def float?
      @tags.include?("tnum:frac")
    end

    # Particle ("qub" tag); method name kept for backward compatibility.
    def qublic?
      @tags.include?("qub")
    end

    # Agglutination marker.
    def agl?
      @tags.include?("agl")
    end

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module NLP
  # Forward-only cursor over the flattened token list of a Text.
  # #next advances to the next token of a requested kind; #current
  # returns the token under the cursor or nil at end of stream.
  class TokenScanner

    attr_reader :text, :tokens

    # text - an object responding to #flatten (e.g. NLP::Text),
    #        yielding the token list to scan.
    def initialize(text)
      @text = text
      @pos = 0
      @tokens = @text.flatten
    end

    # Advances past the current token, then keeps advancing until a
    # token matching +type+ (:word, :interp, :number, :alphanum) is
    # found or the stream is exhausted. An unrecognised type advances
    # by exactly one token, as before. The four copy-pasted skip loops
    # of the original are folded into one predicate-driven loop.
    def next(type)
      @pos += 1

      wanted =
        case type
        when :word     then ->(t) { t.word? }
        when :interp   then ->(t) { t.interp? }
        when :number   then ->(t) { t.number? }
        when :alphanum then ->(t) { t.number? || t.word? }
        end
      return unless wanted

      @pos += 1 until @pos >= @tokens.size || wanted.call(@tokens[@pos])
    end

    # Token under the cursor, or nil when the stream is exhausted.
    # Uses >= (the original's == could index past the array if the
    # cursor was pushed beyond the end by unknown-type #next calls).
    def current
      @pos >= @tokens.size ? nil : @tokens[@pos]
    end

    # Resets the cursor to the first token.
    def rewind
      @pos = 0
    end

    # Current cursor position in the flattened token list.
    def index
      @pos
    end

    # True when all tokens have been consumed.
    def end?
      @pos >= @tokens.size
    end

  end
end
|
data/lib/tagger/word.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module NLP
  # A word token: extends Token with the base form (lemma) and mixes in
  # the grammatical (Inflectable) and semantic (Meaningable) predicates.
  class Word < Token

    include Inflectable
    include Meaningable

    attr_reader :lemat
    attr_accessor :category

    # word  - surface form
    # lemat - base form (lemma)
    # tags  - morphosyntactic tag string (stored by Token)
    def initialize(word, lemat, tags)
      super(word, tags)
      @lemat = lemat
    end

    # Inflectable expects #inflection; the tag string doubles as the
    # inflection description.
    def inflection
      @tags
    end

  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 8
|
10
|
+
version: 0.2.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -59,9 +59,26 @@ extra_rdoc_files:
|
|
59
59
|
files:
|
60
60
|
- dict/liwc
|
61
61
|
- dict/rid
|
62
|
-
- lib/
|
62
|
+
- lib/analizators/analyzer.rb
|
63
|
+
- lib/analizators/liwc_analyzer.rb
|
64
|
+
- lib/analizators/rid_analyzer.rb
|
65
|
+
- lib/dictionaries/category.rb
|
66
|
+
- lib/dictionaries/dictionary.rb
|
67
|
+
- lib/dictionaries/liwc_category.rb
|
68
|
+
- lib/dictionaries/pl_trie.rb
|
69
|
+
- lib/dictionaries/rid_category.rb
|
63
70
|
- lib/nlp.rb
|
64
71
|
- lib/stdlib/ext/array.rb
|
72
|
+
- lib/tagger/emoticon.rb
|
73
|
+
- lib/tagger/inflectable.rb
|
74
|
+
- lib/tagger/lemmatizer.rb
|
75
|
+
- lib/tagger/meaningable.rb
|
76
|
+
- lib/tagger/sentence.rb
|
77
|
+
- lib/tagger/takipi_web_service.rb
|
78
|
+
- lib/tagger/text.rb
|
79
|
+
- lib/tagger/token.rb
|
80
|
+
- lib/tagger/token_scanner.rb
|
81
|
+
- lib/tagger/word.rb
|
65
82
|
- lib/text_statistics.rb
|
66
83
|
- LICENSE
|
67
84
|
- README.rdoc
|
data/lib/morfeusz.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# Ruby bindings for Morfeusz v. 0.1
|
2
|
-
# Author: Aleksander Pohl
|
3
|
-
# apohllo@o2.pl
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
require 'inline'
|
7
|
-
require 'singleton'
|
8
|
-
require 'iconv'
|
9
|
-
module NLP
|
10
|
-
module Morfeusz
|
11
|
-
MORFOPT_ENCODING = 1
|
12
|
-
MORFEUSZ_UTF_8 = 8
|
13
|
-
class Morfeusz
|
14
|
-
include Singleton
|
15
|
-
|
16
|
-
inline(:C) do |builder|
|
17
|
-
builder.include '"morfeusz.h"'
|
18
|
-
builder.add_compile_flags '-lmorfeusz', '-I/home/knife/morf/include/'
|
19
|
-
builder.c <<-END
|
20
|
-
void initialize(){
|
21
|
-
morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
|
22
|
-
}
|
23
|
-
END
|
24
|
-
|
25
|
-
builder.c <<-END
|
26
|
-
char * about(){
|
27
|
-
return morfeusz_about();
|
28
|
-
}
|
29
|
-
END
|
30
|
-
|
31
|
-
builder.c <<-END
|
32
|
-
VALUE _base(VALUE str){
|
33
|
-
char * p;
|
34
|
-
int index = 0;
|
35
|
-
VALUE arr = rb_ary_new();
|
36
|
-
int id_push = rb_intern("push");
|
37
|
-
p = StringValuePtr(str);
|
38
|
-
InterpMorf* result = morfeusz_analyse(p);
|
39
|
-
InterpMorf el;
|
40
|
-
while((el = result[index++]).k != -1){
|
41
|
-
if(el.haslo != NULL){
|
42
|
-
rb_funcall(arr,id_push,1,rb_str_new2(el.haslo));
|
43
|
-
}
|
44
|
-
}
|
45
|
-
return arr;
|
46
|
-
}
|
47
|
-
END
|
48
|
-
|
49
|
-
def base(word)
|
50
|
-
# _base(word)
|
51
|
-
_base(word).collect{|e| e}
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class Lexeme
|
58
|
-
attr_reader :base_form
|
59
|
-
def initialize(base_form)
|
60
|
-
@base_form = base_form
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.find(word)
|
64
|
-
Morfeusz.instance.base(word).collect{|bf| Lexeme.new(bf)}
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|