nlp 0.2.7 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analizators/analyzer.rb +28 -0
- data/lib/analizators/liwc_analyzer.rb +68 -0
- data/lib/analizators/rid_analyzer.rb +10 -0
- data/lib/dictionaries/category.rb +27 -0
- data/lib/dictionaries/dictionary.rb +76 -0
- data/lib/dictionaries/liwc_category.rb +54 -0
- data/lib/dictionaries/pl_trie.rb +31 -0
- data/lib/dictionaries/rid_category.rb +21 -0
- data/lib/nlp.rb +0 -1
- data/lib/tagger/emoticon.rb +13 -0
- data/lib/tagger/inflectable.rb +59 -0
- data/lib/tagger/lemmatizer.rb +112 -0
- data/lib/tagger/meaningable.rb +63 -0
- data/lib/tagger/sentence.rb +24 -0
- data/lib/tagger/takipi_web_service.rb +51 -0
- data/lib/tagger/text.rb +24 -0
- data/lib/tagger/token.rb +45 -0
- data/lib/tagger/token_scanner.rb +58 -0
- data/lib/tagger/word.rb +20 -0
- metadata +21 -4
- data/lib/morfeusz.rb +0 -69
@@ -0,0 +1,28 @@
|
|
1
|
+
module NLP
  # Base dictionary-driven analyzer: looks every word token up in a
  # Dictionary and accumulates the hits in a TextStatistics object.
  class Analyzer

    # dict - dictionary identifier passed straight to Dictionary.new
    def initialize(dict)
      @dictionary = Dictionary.new(dict)
    end

    # Walks the scanner over all word tokens, recording dictionary category
    # hits per lemma. Returns the populated TextStatistics.
    def analyze(scanner)
      stats = TextStatistics.new

      while (token = scanner.current)
        lemma = token.lemat
        cats = @dictionary.find(lemma)
        stats.add(lemma, cats) unless cats.nil?
        stats.total_words += 1
        scanner.next(:word)
      end

      stats
    end
  end
end
|
28
|
+
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module NLP
  # LIWC (Linguistic Inquiry and Word Count) analyzer: buckets every
  # alphanumeric token into LIWC-style category lists collected in the
  # returned TextStatistics.
  class LIWCAnalyzer < Analyzer

    def initialize
      @dictionary = Dictionary.new(:liwc)
    end

    # Scans word and number tokens, assigning each its LIWC category and
    # pushing its surface form into the matching result buckets.
    # Returns a TextStatistics whose +hash+ maps bucket name -> word list.
    def analyze(scanner)
      results = TextStatistics.new
      results.hash = {
        :long_words => [],
        :zaimki => [],
        :zaimki1 => [],
        :zaimki2 => [],
        :zaimki3 => [],
        :przyimki => [],
        :numbers => [],
        :emotion => [],
        :social => [],
        :personal => [],
        :posemotion => [],
        :negemotion => [],
        :wulgar => [],
        :cognitive => []
      }

      while token = scanner.current
        word = token.lemat
        # strip everything except word characters and hyphens before lookup
        categories = @dictionary.find(word.gsub(/[^\w-]/, ""))

        unless categories.nil?
          results.add(word, categories)
          token.category = categories.first

          results[:emotion].push token.orth if token.emotion?
          results[:social].push token.orth if token.social?
          results[:personal].push token.orth if token.personal?
          results[:wulgar].push token.orth if token.bad_word?
          results[:cognitive].push token.orth if token.cognitive?

          results[:posemotion].push token.orth if token.positive_emotion?
          results[:negemotion].push token.orth if token.negative_emotion?
        end

        # words longer than 10 characters.
        # FIX: String#jlength came from the Ruby 1.8 `jcode` library and does
        # not exist in Ruby >= 1.9; String#length counts characters under the
        # string's encoding there, which is what jlength approximated.
        results[:long_words].push word if word.length > 10

        if token.zaimek?
          results[:zaimki].push word

          # grammatical person buckets: 1st (ja/my), 2nd (ty/wy), 3rd (on)
          results[:zaimki1].push token.orth if word == 'ja' || word == 'my'
          results[:zaimki2].push token.orth if word == 'ty' || word == 'wy'
          results[:zaimki3].push token.orth if word == 'on'
        end

        results[:przyimki].push word if token.przyimek?
        results[:numbers].push token.orth if token.number? || token.liczebnik?

        results.total_words += 1
        scanner.next(:alphanum)
      end
      results
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module NLP
  # One node in a dictionary category tree (RID/LIWC hierarchies).
  # Each category knows its symbolic name and optional parent category.
  class Category
    attr_reader :parent, :name

    # name   - category label (String or Symbol); stored as a Symbol
    # parent - enclosing Category, or nil for a top-level category
    def initialize(name, parent = nil)
      @parent = parent
      @name = name.to_sym
    end

    # Slash-separated path from the root category down to this node.
    def path
      return name.to_s unless @parent
      "#{@parent.path}/#{name}"
    end

    # Symbol name of the topmost ancestor (self when there is no parent).
    def root
      node = self
      node = node.parent until node.parent.nil?
      node.name
    end

    # Quoted path, e.g. "\"EMOCJE/POZYTYWNE_EMOCJE\"".
    def to_s
      path.inspect
    end

  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module NLP
  # Word -> category dictionary backed by a PlTrie. The parsed trie is
  # cached via Marshal under DICTIONARY_CACHE_DIR so later loads skip
  # re-parsing the category file.
  class Dictionary

    attr_accessor :tree

    # category_file - dictionary id (:rid or :liwc); also the cache suffix
    # restore       - when true, load the marshalled trie cache if present
    def initialize(category_file = :rid, restore = true)
      state_file = File.expand_path(DICTIONARY_CACHE_DIR + ".#{category_file}")
      if restore && File.exist?(state_file)
        @tree = Dictionary.restore(state_file)
      else
        @tree = PlTrie.new
        load_categories(File.dirname(__FILE__) + "/../../dict/#{category_file}", category_file)
        store(state_file)
      end
    end

    # Marshals the trie to +state_file+. Returns self.
    def store(state_file)
      File.open(File.expand_path(state_file), "w") do |file|
        Marshal.dump(self.tree, file)
      end
      self
    end

    # Loads a previously marshalled trie.
    # NOTE: Marshal.restore may only ever be fed trusted files (this gem's
    # own cache) — deserializing untrusted data can execute arbitrary code.
    def self.restore(state_file)
      File.open(File.expand_path(state_file)) do |file|
        Marshal.restore(file)
      end
    end

    # Returns the categories stored for +word+, or nil when the word is
    # absent or contains characters outside the trie alphabet.
    # FIX: was a bare `rescue` (which also caught ScriptError and friends);
    # narrowed to StandardError while keeping the best-effort nil result.
    def find(word)
      @tree.find(word)
    rescue StandardError
      nil
    end

    # Parses an indented category file: leading tab count encodes hierarchy
    # depth (0 = primary, 1 = secondary, 2 = tertiary when the line is ALL
    # CAPS); any other line is a word inserted under the current category.
    # FIX: removed the inner `begin ... rescue; raise; end`, which was a
    # no-op (re-raising every exception unchanged).
    def load_categories(category_file, type)
      category = nil
      primary = nil
      secondary = nil
      tertiary = nil

      cat_class = type == :rid ? RIDCategory : LIWCCategory

      File.open(category_file) do |file|
        while line = file.gets
          line.chomp!
          lead, rest = line.scan(/(\t*)(.*)/).first
          if lead.size == 0
            category = primary = cat_class.new(rest)
            secondary, tertiary = nil
          elsif lead.size == 1
            category = secondary = cat_class.new(rest, primary)
            tertiary = nil
          elsif lead.size == 2 && (cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
            category = tertiary = cat_class.new(rest, secondary)
          else
            # LIWC word lists carry trailing "(1)" markers — strip them
            word = rest.downcase.gsub(/\s*\(1\)$/, '')
            @tree.insert(word, category)
          end
        end
      end
    end
  end
end
|
76
|
+
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module NLP
  # LIWC dictionary category. Primary predicates test the root of the
  # category tree; secondary predicates test fragments of the full path.
  class LIWCCategory < Category

    # primary categories: predicate name => expected tree root
    {
      :linguistic?    => :PIERWOTNE,
      :psychological? => :PROCESY_PSYCHOLOGICZNE,
      :relative?      => :RELATYWNOSC,
      :personal?      => :OSOBISTE
    }.each do |meth, root_sym|
      define_method(meth) { root == root_sym }
    end

    # secondary categories: predicate name => fragment sought in the path
    {
      :emotion?          => 'EMOCJE',
      :positive_emotion? => 'POZYTYWNE_EMOCJE',
      :negative_emotion? => 'NEGATYWNE_EMOCJE',
      :cognitive?        => 'KOGNITYWNE_PROCESY',
      :sense?            => 'ZMYSLY',
      :social?           => 'SOCIAL',
      :bad_word?         => 'WULGAR'
    }.each do |meth, fragment|
      define_method(meth) { path.include?(fragment) }
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'ds'

# Trie specialised for Polish text, built on the DS gem's Trie class.
module NLP

  include DS

  class PlTrie < Trie

    # Polish alphabet (plus hyphen and trailing space) — presumably consumed
    # by the inherited #key to map characters to child indices; TODO confirm
    # against the ds gem's Trie implementation.
    ALPHABET = %w{- a ą b c ć d e ę f g h i j k l ł m n ń o ó p r s ś t u v w x y z ź ż} << ' '

    # Recursive insert helper: walks down one trie node per element of +s+
    # and appends +value+ to the @data list at the terminal node, so several
    # values can accumulate under the same key.
    # NOTE(review): the commented `private` marker suggests this was meant to
    # be private, but it must stay public because each node calls it on its
    # child subtrees.
    #private
    def priv_insert(s, value)
      if s.empty?
        # terminal node reached: start or extend the value list
        if @data.nil?
          @data = [value]
        else
          @data.push value
        end
      else
        # descend into (or lazily create) the child for the first element
        index = key(s.first)
        subtree = if @children[index]
          @children[index]
        else
          @children[index] = PlTrie.new
        end

        subtree.priv_insert(s[1..-1], value)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module NLP
  # Category from the Regressive Imagery Dictionary: the tree roots are
  # primary process, secondary process and emotions.
  class RIDCategory < Category

    # The three RID top-level categories as fresh instances.
    def self.top_level
      [:PIERWOTNE, :WTORNE, :EMOCJE].map { |sym| new(sym) }
    end

    # Primary-process category?
    def primary?
      :PIERWOTNE == root
    end

    # Secondary-process category?
    def secondary?
      :WTORNE == root
    end

    # Emotions category?
    def emotions?
      :EMOCJE == root
    end

  end
end
|
data/lib/nlp.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
# Mixin that generates grammatical predicate methods (rzeczownik?,
# liczba_mnoga?, mianownik?, ...) from a tag table. Host objects must
# provide #inflection returning a colon-separated morphosyntactic tag
# string (e.g. "subst:sg:nom:m1").
module Inflectable

  # Maps tagger tag symbols to the Polish predicate base names generated
  # below. An Array key means several tags share one predicate.
  GRAM_CAT = {
    # parts of speech
    :adj => 'przymiotnik',           # adjective
    [:subst,:depr] => 'rzeczownik',  # noun
    :adv => 'przyslowek',            # adverb
    :num => 'liczebnik',             # numeral
    [:pron,:siebie] => 'zaimek',     # pronoun
    :prep => 'przyimek',             # preposition

    # number
    :sg => 'liczba_pojedyncza',
    :pl => 'liczba_mnoga',

    # cases
    :nom => 'mianownik',
    :gen => 'dopelniacz',
    :dat => 'celownik',
    :acc => 'biernik',
    :inst => 'narzednik',
    :loc => 'miejscownik',
    :voc => 'wolacz',

    # genders
    :m1 => 'meski_osobowy',
    :m2 => 'meski_zwierzecy',
    :m3 => 'meski_rzeczowy',
    :f => 'zenski',
    :n1 => 'nijaki_zbiorowy',
    # FIX: was 'nijaki zwykly' — the space produced a method named
    # "nijaki zwykly?" that cannot be called with normal syntax and broke
    # the underscore convention every other entry follows.
    :n2 => 'nijaki_zwykly',
    :p1 => 'przymnogi_osobowy',
    :p2 => 'przymnogi_zwykly',
    :p3 => 'przymnogi_opisowy',

    # persons
    :pri => "pierwsza_osoba",
    :sec => "druga_osoba",
    :ter => "trzecia_osoba",

    # degrees of comparison
    :pos => "stopien_rowny",
    :comp => "stopien_wyzszy",
    :sup => "stopien_najwyzszy"
  }

  # For each entry define `<name>?`, true when any colon-separated segment
  # of the inflection string contains one of the mapped tags.
  GRAM_CAT.each do |key, value|
    define_method(value + "?") {
      tags = key.is_a?(Array) ? key : [key]
      inflection.split(":").any? { |segment|
        tags.any? { |tag| segment.include? tag.to_s }
      }
    }
  end

end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'rexml/document'
require 'shellwords'

module NLP
  # Turns raw text into a tagged NLP::Text using an external lemmatizer
  # (the TAKIPI tagger, either a local binary or the remote web service).
  class Lemmatizer

    include REXML

    # Lemmatizes +text+ (a String, or an open File which is read and
    # closed). method selects the backend (:takipi; anything else falls
    # back to the remote TAKIPI service). Raises ArgumentError for other
    # input types. Returns an NLP::Text.
    def self.lemmatize(text, method = nil, input_type = nil)
      if text.is_a? File
        str = text.read
        text.close
      elsif text.is_a? String
        str = text
      else
        raise ArgumentError, "Argument is not String or File"
      end

      if method == :takipi
        takipi_lemmatize(str, input_type)
      else
        # Default lemmatization method is the remote TAKIPI service;
        # the Morfeusz backend below is currently disabled.
        takipi_lemmatize(str, :remote)
        #morfeusz_lemmatize(str)
      end
    end

    # Runs TAKIPI either as a local binary (:local) or via the SOAP web
    # service (:remote) and parses the resulting XML.
    def self.takipi_lemmatize(text, method)
      if method == :local
        xml_file = TAKIPI_XML_FILE

        # FIX (security): the text is interpolated into a shell command, so
        # it must be escaped — the old `echo '#{text}'` let a single quote
        # in the input break out and inject arbitrary shell.
        # (Also dropped a Thread.new immediately followed by join — it added
        # nothing over running the command inline.)
        `echo #{Shellwords.escape(text)} > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`

        doc = File.open(xml_file, "r") do |f|
          # block form closes the handle (the old code leaked it)
          Document.new f
        end
      elsif method == :remote
        xml = TakipiWebService.request(text)
        doc = Document.new xml
      else
        raise ArgumentError, 'Argument is not :local or :remote'
      end

      parse_lemmatized_xml(doc)
    end

    # Morfeusz-based backend — currently disabled pending a TokenScanner
    # abstraction for splitting the input. Returns an empty Text.
    def self.morfeusz_lemmatize(text)
      temp_text = Text.new

      #simple tagger
      #TODO lemmatizer should take TokenScanner object that defines
      #how split string
      # text.split(/\.|!|\?/).each do |s|
      #   sentence = Sentence.new
      #   sentence << s.split(" ").collect{ |t|
      #     if word = Morfeusz::Lexeme.find(t)
      #       if word[0]
      #         Word.new(t,word[0].base_form,"")
      #       else
      #         Word.new(t,"","")
      #       end
      #     else
      #       Word.new(t,"","")
      #     end
      #   }
      #   temp_text << sentence
      # end
      temp_text
    end

    # Builds an NLP::Text out of TAKIPI's XML: each <chunk> becomes a
    # Sentence, each <tok> a Word carrying lemma and inflection tags.
    def self.parse_lemmatized_xml(doc)
      text = Text.new

      doc.elements.each("*/chunkList/chunk") do |chunk|
        sentence = Sentence.new
        tokens = []

        chunk.elements.each("tok") do |tok|
          word = tok.elements[1].text
          # FIX: `lemat, inflect = ""` assigned "" only to lemat and left
          # inflect nil; both must default to the empty string.
          lemat = inflect = ""

          tok.elements.each("lex") do |lex|
            if lex.has_attributes?
              lemat = lex.elements[1].text
              inflect = lex.elements[2].text
            end
          end

          tokens << Word.new(word, lemat, inflect)
        end

        sentence << tokens
        text << sentence
      end
      text
    end

  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Mixin giving a token semantic (LIWC) predicates. Host objects must
# provide #category returning an object that responds to #root and #path.
module Meaningable

  # LIWC primary categories: predicate name => expected category-tree root.
  {
    :linguistic?    => :PIERWOTNE,
    :psychological? => :PROCESY_PSYCHOLOGICZNE,
    :relative?      => :RELATYWNOSC,
    :personal?      => :OSOBISTE
  }.each do |meth, root_sym|
    define_method(meth) { category.root == root_sym }
  end

  # LIWC secondary categories: predicate name => fragment of category path.
  {
    :emotion?          => 'EMOCJE',
    :positive_emotion? => 'POZYTYWNE_EMOCJE',
    :negative_emotion? => 'NEGATYWNE_EMOCJE',
    :cognitive?        => 'KOGNITYWNE_PROCESY',
    :sense?            => 'ZMYSLY',
    :social?           => 'SOCIAL',
    :bad_word?         => 'WULGAR'
  }.each do |meth, fragment|
    define_method(meth) { category.path.include?(fragment) }
  end

  # SEMANTIC — unimplemented placeholders kept for interface compatibility.
  def synonym?(other)
  end

  def synonyms
  end

end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # Ordered collection of tokens forming one sentence.
  class Sentence

    attr_reader :tokens

    def initialize
      @tokens = []
    end

    # Appends a single token or a whole array of tokens.
    # Returns self so appends can be chained.
    def <<(items)
      case items
      when Array then @tokens.concat(items)
      else @tokens << items
      end
      self
    end

    # Number of tokens that are not punctuation.
    def words_number
      @tokens.reject(&:interp?).size
    end

  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'rubygems'
require 'savon'

# SOAP client for the TAKIPI morphosyntactic tagger hosted by the CLARIN
# project at Wrocław University of Technology.
class TakipiWebService
  URL = 'http://nlp.pwr.wroc.pl/clarin/ws/takipi/'
  WSDL_URL = URL + 'takipi.wsdl'

  # Submits +text+ for tagging, polls the job until it finishes, fetches the
  # result and returns it wrapped into a well-formed XML string.
  def self.request(text)
    client = Savon::Client.new WSDL_URL, :soap_endpoint => URL

    # Call remote service methods
    response = client.tag do |soap|
      soap.body = "<text>#{text}</text><format>TXT</format><useGuesser>true</useGuesser>"
    end

    response = response.to_hash
    token = response[:tag_response][:tag_response][:msg]
    status = (response[:tag_response][:tag_response][:status]).to_i

    # Poll job status — presumably 1 = done, 2/3 = still processing
    # (service status codes are not documented here; TODO confirm).
    # NOTE(review): `count` grows by 5 per 1-second sleep, so the loop gives
    # up after ~12 iterations rather than the 60 seconds `timeout` suggests,
    # and `step` is never used — confirm the intended polling interval.
    # NOTE(review): an unexpected status (neither 1 nor 2/3) never increments
    # `count`, so the loop would spin forever — confirm this cannot happen.
    timeout = 60
    step = 5
    count = 0
    loop do
      break if count > timeout
      if status == 1
        break
      elsif status == 2 or status == 3
        count += 5
        sleep(1)
        r = client.get_status do |soap|
          soap.body = "<token>#{token}</token>"
        end.to_hash
        status = (r[:get_status_response][:status]).to_i

      end
    end

    # fetch the finished result

    result = client.get_result do |soap|
      soap.body="<token>#{token}</token>"
    end

    response_document = result.to_hash[:get_result_response][:tag_response][:msg]

    # wrap the returned chunk fragments into a well-formed XML document
    return "<xml><chunkList>#{response_document}</chunkList></xml>"
  end
end
|
51
|
+
|
data/lib/tagger/text.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module NLP
  # A tagged text: an ordered list of Sentence objects.
  class Text
    attr_reader :sentences

    def initialize
      @sentences = []
    end

    # Appends one sentence to the text.
    def <<(sentence)
      @sentences.push sentence
    end

    # Mean word count per sentence (Array#mean comes from this gem's
    # lib/stdlib/ext/array.rb extension).
    def words_per_sentence
      @sentences.map(&:words_number).mean
    end

    # All tokens of all sentences, flattened into one array.
    def flatten
      @sentences.flat_map(&:tokens)
    end

  end
end
|
data/lib/tagger/token.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module NLP
  # Smallest unit produced by the tagger: a surface form (+orth+) together
  # with its raw morphosyntactic tag string (+tags+).
  class Token

    attr_reader :orth
    attr_reader :tags

    def initialize(orth, tags)
      @orth = orth
      @tags = tags
    end

    # Symbol token (tag string is exactly "tsym")?
    def symbol?
      @tags.eql? "tsym"
    end

    # Punctuation token (tag string is exactly "interp")?
    def interp?
      @tags.eql? "interp"
    end

    # A word is anything that is not punctuation, a number or an agglutinant.
    def word?
      !(interp? || number? || agl?)
    end

    # Any numeric token.
    def number?
      @tags.include?("tnum")
    end

    # Integer literal.
    def integer?
      @tags.include?("tnum:integer")
    end

    # Fractional literal.
    def float?
      @tags.include?("tnum:frac")
    end

    # Particle ("qub" tag).
    def qublic?
      @tags.include?("qub")
    end

    # Agglutinative form ("agl" tag).
    def agl?
      @tags.include?("agl")
    end

  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module NLP
  # Cursor over the flattened token stream of a Text. #next advances to the
  # following token matching the requested kind; #current returns nil once
  # the stream is exhausted.
  class TokenScanner

    attr_reader :text, :tokens

    def initialize(text)
      @text = text
      @pos = 0
      @tokens = @text.flatten
    end

    # Advances at least one token, then keeps moving until a token of the
    # requested +type+ (:word, :interp, :number or :alphanum) is found or
    # the stream ends. Unknown types just advance by one.
    def next(type)
      @pos += 1

      wanted =
        case type
        when :word     then ->(t) { t.word? }
        when :interp   then ->(t) { t.interp? }
        when :number   then ->(t) { t.number? }
        when :alphanum then ->(t) { t.number? || t.word? }
        end
      return if wanted.nil?

      @pos += 1 while @pos < @tokens.size && !wanted.call(@tokens[@pos])
    end

    # Token under the cursor, or nil past the end of the stream.
    def current
      @pos == @tokens.size ? nil : @tokens[@pos]
    end

    # Resets the cursor to the first token.
    def rewind
      @pos = 0
    end

    # Current cursor position.
    def index
      @pos
    end

    # True once the cursor has moved past the last token.
    def end?
      @pos == tokens.size
    end

  end
end
|
data/lib/tagger/word.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module NLP
  # A proper word token: surface form plus its lemma and inflection tags.
  # Gains grammatical predicates from Inflectable (which reads #inflection)
  # and semantic predicates from Meaningable (which reads #category).
  class Word < Token

    include Inflectable
    include Meaningable

    # Base (dictionary) form of the word.
    attr_reader :lemat
    # Dictionary category assigned during analysis (set by LIWCAnalyzer).
    attr_accessor :category

    def initialize(word, lemat, tags)
      super(word,tags)
      @lemat = lemat
    end

    # Raw tag string, exposed under the name Inflectable expects.
    def inflection
      @tags
    end

  end
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 7
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 8
|
10
|
+
version: 0.2.8
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -59,9 +59,26 @@ extra_rdoc_files:
|
|
59
59
|
files:
|
60
60
|
- dict/liwc
|
61
61
|
- dict/rid
|
62
|
-
- lib/
|
62
|
+
- lib/analizators/analyzer.rb
|
63
|
+
- lib/analizators/liwc_analyzer.rb
|
64
|
+
- lib/analizators/rid_analyzer.rb
|
65
|
+
- lib/dictionaries/category.rb
|
66
|
+
- lib/dictionaries/dictionary.rb
|
67
|
+
- lib/dictionaries/liwc_category.rb
|
68
|
+
- lib/dictionaries/pl_trie.rb
|
69
|
+
- lib/dictionaries/rid_category.rb
|
63
70
|
- lib/nlp.rb
|
64
71
|
- lib/stdlib/ext/array.rb
|
72
|
+
- lib/tagger/emoticon.rb
|
73
|
+
- lib/tagger/inflectable.rb
|
74
|
+
- lib/tagger/lemmatizer.rb
|
75
|
+
- lib/tagger/meaningable.rb
|
76
|
+
- lib/tagger/sentence.rb
|
77
|
+
- lib/tagger/takipi_web_service.rb
|
78
|
+
- lib/tagger/text.rb
|
79
|
+
- lib/tagger/token.rb
|
80
|
+
- lib/tagger/token_scanner.rb
|
81
|
+
- lib/tagger/word.rb
|
65
82
|
- lib/text_statistics.rb
|
66
83
|
- LICENSE
|
67
84
|
- README.rdoc
|
data/lib/morfeusz.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
# Ruby bindings for Morfeusz v. 0.1
|
2
|
-
# Author: Aleksander Pohl
|
3
|
-
# apohllo@o2.pl
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
require 'inline'
|
7
|
-
require 'singleton'
|
8
|
-
require 'iconv'
|
9
|
-
module NLP
|
10
|
-
module Morfeusz
|
11
|
-
MORFOPT_ENCODING = 1
|
12
|
-
MORFEUSZ_UTF_8 = 8
|
13
|
-
class Morfeusz
|
14
|
-
include Singleton
|
15
|
-
|
16
|
-
inline(:C) do |builder|
|
17
|
-
builder.include '"morfeusz.h"'
|
18
|
-
builder.add_compile_flags '-lmorfeusz', '-I/home/knife/morf/include/'
|
19
|
-
builder.c <<-END
|
20
|
-
void initialize(){
|
21
|
-
morfeusz_set_option(#{MORFOPT_ENCODING},#{MORFEUSZ_UTF_8});
|
22
|
-
}
|
23
|
-
END
|
24
|
-
|
25
|
-
builder.c <<-END
|
26
|
-
char * about(){
|
27
|
-
return morfeusz_about();
|
28
|
-
}
|
29
|
-
END
|
30
|
-
|
31
|
-
builder.c <<-END
|
32
|
-
VALUE _base(VALUE str){
|
33
|
-
char * p;
|
34
|
-
int index = 0;
|
35
|
-
VALUE arr = rb_ary_new();
|
36
|
-
int id_push = rb_intern("push");
|
37
|
-
p = StringValuePtr(str);
|
38
|
-
InterpMorf* result = morfeusz_analyse(p);
|
39
|
-
InterpMorf el;
|
40
|
-
while((el = result[index++]).k != -1){
|
41
|
-
if(el.haslo != NULL){
|
42
|
-
rb_funcall(arr,id_push,1,rb_str_new2(el.haslo));
|
43
|
-
}
|
44
|
-
}
|
45
|
-
return arr;
|
46
|
-
}
|
47
|
-
END
|
48
|
-
|
49
|
-
def base(word)
|
50
|
-
# _base(word)
|
51
|
-
_base(word).collect{|e| e}
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
class Lexeme
|
58
|
-
attr_reader :base_form
|
59
|
-
def initialize(base_form)
|
60
|
-
@base_form = base_form
|
61
|
-
end
|
62
|
-
|
63
|
-
def self.find(word)
|
64
|
-
Morfeusz.instance.base(word).collect{|bf| Lexeme.new(bf)}
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|