nlp 0.2.6 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/dict/liwc
CHANGED
@@ -509,7 +509,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
509
509
|
obraźliwy
|
510
510
|
obrona
|
511
511
|
oburzenie
|
512
|
-
obwiniać
|
512
|
+
obwiniać
|
513
513
|
ofiara
|
514
514
|
okrutny
|
515
515
|
oszustwo
|
@@ -637,7 +637,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
637
637
|
tragedia
|
638
638
|
tragiczny
|
639
639
|
współczucie
|
640
|
-
|
640
|
+
współczuć
|
641
641
|
żal
|
642
642
|
żałosny
|
643
643
|
zawalić
|
@@ -851,7 +851,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
851
851
|
zaprzeczać
|
852
852
|
zatkać
|
853
853
|
zatrzymanie
|
854
|
-
zatrzymywać
|
854
|
+
zatrzymywać
|
855
855
|
zawada
|
856
856
|
zawierać
|
857
857
|
zignorować
|
@@ -1153,7 +1153,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
1153
1153
|
kontakt
|
1154
1154
|
łapać
|
1155
1155
|
macać
|
1156
|
-
obmacywać
|
1156
|
+
obmacywać
|
1157
1157
|
odczuwać
|
1158
1158
|
poczucie
|
1159
1159
|
przecierać
|
@@ -1614,7 +1614,7 @@ OSOBISTE
|
|
1614
1614
|
wymagania
|
1615
1615
|
zadanie domowe
|
1616
1616
|
zaganiany
|
1617
|
-
zarządzanie
|
1617
|
+
zarządzanie
|
1618
1618
|
zespół
|
1619
1619
|
zgłoszenie
|
1620
1620
|
PRACA
|
@@ -1664,7 +1664,7 @@ OSOBISTE
|
|
1664
1664
|
przedsiębiorstwo
|
1665
1665
|
przemysł
|
1666
1666
|
przemysłowy
|
1667
|
-
przyjmować
|
1667
|
+
przyjmować
|
1668
1668
|
rozmowa kwalfikacyjna
|
1669
1669
|
rynek
|
1670
1670
|
sekretarka
|
@@ -1677,7 +1677,7 @@ OSOBISTE
|
|
1677
1677
|
współpracownik
|
1678
1678
|
wynagrodzenie
|
1679
1679
|
wypłata
|
1680
|
-
wyposażenie
|
1680
|
+
wyposażenie
|
1681
1681
|
wyrzucać
|
1682
1682
|
wywalać
|
1683
1683
|
zajęcie
|
@@ -1733,7 +1733,7 @@ OSOBISTE
|
|
1733
1733
|
przegrany
|
1734
1734
|
przewyższać
|
1735
1735
|
przodować
|
1736
|
-
rezultat
|
1736
|
+
rezultat
|
1737
1737
|
rozwiązać
|
1738
1738
|
rozwiązanie
|
1739
1739
|
silny
|
@@ -1918,7 +1918,6 @@ OSOBISTE
|
|
1918
1918
|
pożyczyć
|
1919
1919
|
profit
|
1920
1920
|
prowizja
|
1921
|
-
prowizja
|
1922
1921
|
rentowny
|
1923
1922
|
sklep
|
1924
1923
|
spadek
|
@@ -2000,8 +1999,8 @@ OSOBISTE
|
|
2000
1999
|
wiara
|
2001
2000
|
wieczność
|
2002
2001
|
wieczny
|
2003
|
-
|
2004
|
-
|
2002
|
+
wielkanoc
|
2003
|
+
żyd
|
2005
2004
|
żydowski
|
2006
2005
|
ŚMIERĆ
|
2007
2006
|
cmentarz
|
@@ -2383,7 +2382,7 @@ OSOBISTE
|
|
2383
2382
|
zmęczony
|
2384
2383
|
TOALETA
|
2385
2384
|
czysty
|
2386
|
-
czyścić
|
2385
|
+
czyścić
|
2387
2386
|
kąpać
|
2388
2387
|
kąpiel
|
2389
2388
|
golić
|
data/dict/rid
CHANGED
@@ -936,7 +936,7 @@ PIERWOTNE
|
|
936
936
|
schnąć
|
937
937
|
śmierć
|
938
938
|
spokój
|
939
|
-
spokojny
|
939
|
+
spokojny
|
940
940
|
stagnacja
|
941
941
|
statyczny
|
942
942
|
tapczan
|
@@ -959,7 +959,7 @@ PIERWOTNE
|
|
959
959
|
zmiękczenie
|
960
960
|
znużenie
|
961
961
|
zrelaksować
|
962
|
-
zrelaksowany
|
962
|
+
zrelaksowany
|
963
963
|
PODRÓŻ
|
964
964
|
agitować
|
965
965
|
aktywność
|
@@ -1140,7 +1140,7 @@ PIERWOTNE
|
|
1140
1140
|
REGRESYWNOŚĆ
|
1141
1141
|
NIEZNANE
|
1142
1142
|
bezgraniczny
|
1143
|
-
bezimienny
|
1143
|
+
bezimienny
|
1144
1144
|
bezkształtny
|
1145
1145
|
cudowny
|
1146
1146
|
czarodziej
|
@@ -1490,7 +1490,7 @@ PIERWOTNE
|
|
1490
1490
|
pływak
|
1491
1491
|
podnosić
|
1492
1492
|
podrzucać
|
1493
|
-
podskakiwać
|
1493
|
+
podskakiwać
|
1494
1494
|
powiesić
|
1495
1495
|
powstały
|
1496
1496
|
powstawać
|
@@ -1791,7 +1791,7 @@ WTORNE
|
|
1791
1791
|
koncept
|
1792
1792
|
konkludować
|
1793
1793
|
konsekwentnie
|
1794
|
-
|
1794
|
+
kryterium
|
1795
1795
|
kwantyfikować
|
1796
1796
|
kwestia
|
1797
1797
|
liczyć
|
@@ -1961,7 +1961,7 @@ WTORNE
|
|
1961
1961
|
obdarzać
|
1962
1962
|
obiecać
|
1963
1963
|
obietnica
|
1964
|
-
obsłużyć
|
1964
|
+
obsłużyć
|
1965
1965
|
obwieścić
|
1966
1966
|
ochraniać
|
1967
1967
|
oddziałowywać
|
@@ -2960,7 +2960,7 @@ EMOCJE
|
|
2960
2960
|
sprzeciwiać
|
2961
2961
|
sprzeczać
|
2962
2962
|
sprzeczać
|
2963
|
-
srogi
|
2963
|
+
srogi
|
2964
2964
|
strzał
|
2965
2965
|
strzała
|
2966
2966
|
strzelać
|
data/lib/nlp.rb
CHANGED
@@ -1,14 +1,36 @@
|
|
1
1
|
module NLP
|
2
|
-
|
3
2
|
TAKIPI_XML_FILE = "/tmp/output.xml"
|
4
3
|
DICTIONARY_CACHE_DIR = "~/"
|
5
|
-
|
6
|
-
|
7
4
|
end
|
8
5
|
|
9
6
|
|
10
7
|
require 'stdlib/ext/array'
|
11
|
-
require '
|
12
|
-
|
8
|
+
require 'morfeusz'
|
9
|
+
|
10
|
+
require "analizators/analyzer"
|
11
|
+
require "analizators/rid_analyzer.rb"
|
12
|
+
require "analizators/liwc_analyzer.rb"
|
13
|
+
|
14
|
+
require "dictionaries/pl_trie"
|
15
|
+
require 'dictionaries/dictionary'
|
16
|
+
require 'dictionaries/category'
|
17
|
+
require "dictionaries/liwc_category"
|
18
|
+
require "dictionaries/rid_category"
|
19
|
+
|
20
|
+
|
21
|
+
require "tagger/inflectable"
|
22
|
+
require "tagger/meaningable"
|
23
|
+
require 'tagger/token'
|
24
|
+
require 'tagger/word'
|
25
|
+
require 'tagger/emoticon'
|
26
|
+
require 'tagger/sentence'
|
27
|
+
require 'tagger/text'
|
28
|
+
require "tagger/token_scanner"
|
29
|
+
require "tagger/takipi_web_service"
|
30
|
+
require "tagger/lemmatizer"
|
31
|
+
|
32
|
+
require "text_statistics"
|
13
33
|
|
34
|
+
require 'jcode'
|
35
|
+
$KCODE = "UTF8"
|
14
36
|
|
data/lib/stdlib/ext/array.rb
CHANGED
@@ -0,0 +1,53 @@
|
|
1
|
+
module NLP
|
2
|
+
class TextStatistics
|
3
|
+
|
4
|
+
attr_accessor :total_words, :hash
|
5
|
+
attr_reader :cwords, :words, :total_words, :word_count, :scores
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@word_count = 0 # number of found words
|
9
|
+
@total_words = 0 # total number of words
|
10
|
+
@scores = Hash.new { 0 } #numbers of words in each category
|
11
|
+
@words = [] #found words
|
12
|
+
@cwords = Hash.new {nil} #found words grouped into categories
|
13
|
+
@hash = {} #additional data
|
14
|
+
end
|
15
|
+
|
16
|
+
#Adds word and its category to stats.
|
17
|
+
def add(word,categories)
|
18
|
+
categories.each do |category|
|
19
|
+
@cwords[category] = [] if @cwords[category].nil?
|
20
|
+
@cwords[category].push word
|
21
|
+
@scores[category] += 1
|
22
|
+
end
|
23
|
+
@words.push word
|
24
|
+
@word_count += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
def [](key)
|
28
|
+
@hash[key]
|
29
|
+
end
|
30
|
+
|
31
|
+
def []=(key,value)
|
32
|
+
@hash[key] = value
|
33
|
+
end
|
34
|
+
|
35
|
+
def category_participation(categories)
|
36
|
+
sorted_scores = @scores.to_a.sort_by{ |result| -result[1] }
|
37
|
+
r = {}
|
38
|
+
categories.each do |cat|
|
39
|
+
r[cat] = percentage_distribution(sorted_scores){|c| c.send(cat.to_s+'?')}
|
40
|
+
end
|
41
|
+
r
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
def percentage_distribution scores, &block
|
46
|
+
all = scores.map{|k,v| v}.inject(0){|e,m|m = m +e}
|
47
|
+
sum = scores.select{|result| yield result[0]}.inject(0){|count,result| count + result[1]}
|
48
|
+
Float(sum)/all
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
-
|
10
|
-
version: 0.2.
|
9
|
+
- 7
|
10
|
+
version: 0.2.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -15,10 +15,38 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
date: 2011-09-13 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: savon
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - "="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 17
|
29
|
+
segments:
|
30
|
+
- 0
|
31
|
+
- 7
|
32
|
+
- 9
|
33
|
+
version: 0.7.9
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: ds
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
22
50
|
description: Tools for processing polish language. Tokenization, scanning, categorization...
|
23
51
|
email: satre@o2.pl
|
24
52
|
executables: []
|
@@ -31,49 +59,18 @@ extra_rdoc_files:
|
|
31
59
|
files:
|
32
60
|
- dict/liwc
|
33
61
|
- dict/rid
|
34
|
-
- lib/analyzer.rb
|
35
|
-
- lib/category.rb
|
36
|
-
- lib/dictionary.rb
|
37
|
-
- lib/emoticon.rb
|
38
|
-
- lib/inflectable.rb
|
39
|
-
- lib/lemmatizer.rb
|
40
|
-
- lib/liwc_analyzer.rb
|
41
|
-
- lib/liwc_category.rb
|
42
|
-
- lib/meaningable.rb
|
43
62
|
- lib/morfeusz.rb
|
44
63
|
- lib/nlp.rb
|
45
|
-
- lib/rid_analyzer.rb
|
46
|
-
- lib/rid_category.rb
|
47
|
-
- lib/sentence.rb
|
48
|
-
- lib/statistic.rb
|
49
64
|
- lib/stdlib/ext/array.rb
|
50
|
-
- lib/
|
51
|
-
- lib/stree.rb
|
52
|
-
- lib/takipi_web_service.rb
|
53
|
-
- lib/text.rb
|
54
|
-
- lib/token.rb
|
55
|
-
- lib/token_scanner.rb
|
56
|
-
- lib/word.rb
|
65
|
+
- lib/text_statistics.rb
|
57
66
|
- LICENSE
|
58
67
|
- README.rdoc
|
59
|
-
- test/sentence_test.rb
|
60
|
-
- test/analyzer_test.rb
|
61
|
-
- test/meaningable_test.rb
|
62
|
-
- test/token_scanner_test.rb
|
63
|
-
- test/helper.rb
|
64
|
-
- test/nlp_test_suite.rb
|
65
|
-
- test/test_nlp.rb
|
66
|
-
- test/word_test.rb
|
67
|
-
- test/lemmatizer_test.rb
|
68
|
-
- test/token_test.rb
|
69
|
-
- test/text_test.rb
|
70
|
-
has_rdoc: true
|
71
68
|
homepage: http://github.com/knife/nlp
|
72
69
|
licenses: []
|
73
70
|
|
74
71
|
post_install_message:
|
75
|
-
rdoc_options:
|
76
|
-
|
72
|
+
rdoc_options: []
|
73
|
+
|
77
74
|
require_paths:
|
78
75
|
- lib
|
79
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -97,19 +94,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
94
|
requirements: []
|
98
95
|
|
99
96
|
rubyforge_project:
|
100
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.7.2
|
101
98
|
signing_key:
|
102
99
|
specification_version: 3
|
103
100
|
summary: Linguistics tools for processing polish language.
|
104
|
-
test_files:
|
105
|
-
|
106
|
-
- test/analyzer_test.rb
|
107
|
-
- test/meaningable_test.rb
|
108
|
-
- test/token_scanner_test.rb
|
109
|
-
- test/helper.rb
|
110
|
-
- test/nlp_test_suite.rb
|
111
|
-
- test/test_nlp.rb
|
112
|
-
- test/word_test.rb
|
113
|
-
- test/lemmatizer_test.rb
|
114
|
-
- test/token_test.rb
|
115
|
-
- test/text_test.rb
|
101
|
+
test_files: []
|
102
|
+
|
data/lib/analyzer.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
require 'dictionary'
|
2
|
-
#require 'morfeusz'
|
3
|
-
require 'token'
|
4
|
-
require 'word'
|
5
|
-
require 'token'
|
6
|
-
require 'text'
|
7
|
-
require 'emoticon'
|
8
|
-
require 'sentence'
|
9
|
-
require "token_scanner.rb"
|
10
|
-
require "lemmatizer"
|
11
|
-
require 'jcode'
|
12
|
-
require 'statistic'
|
13
|
-
$KODE = "UTF8"
|
14
|
-
|
15
|
-
module NLP
|
16
|
-
|
17
|
-
class Analyzer
|
18
|
-
|
19
|
-
def initialize(dict)
|
20
|
-
@dictionary = Dictionary.new(dict)
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
def analyze(scanner)
|
25
|
-
|
26
|
-
results = Statistic.new
|
27
|
-
|
28
|
-
while token = scanner.current
|
29
|
-
word = token.lemat
|
30
|
-
|
31
|
-
categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
|
32
|
-
unless categories.nil?
|
33
|
-
categories.each do |category|
|
34
|
-
puts "Znalazłem słowo #{word} : #{category}"
|
35
|
-
results.add(word,category)
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
results.total_words += 1
|
40
|
-
scanner.next(:word)
|
41
|
-
end
|
42
|
-
|
43
|
-
results
|
44
|
-
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
require "rid_analyzer.rb"
|
50
|
-
require "liwc_analyzer.rb"
|
data/lib/category.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
module NLP
|
2
|
-
class Category
|
3
|
-
attr_reader :parent, :name
|
4
|
-
|
5
|
-
def initialize(name, parent = nil)
|
6
|
-
@parent = parent
|
7
|
-
@name = name.to_sym
|
8
|
-
end
|
9
|
-
|
10
|
-
def path
|
11
|
-
@parent ? (@parent.path + '/' + name.to_s) : name.to_s
|
12
|
-
end
|
13
|
-
|
14
|
-
def root
|
15
|
-
category = self
|
16
|
-
while category.parent != nil
|
17
|
-
category = category.parent
|
18
|
-
end
|
19
|
-
category.name
|
20
|
-
end
|
21
|
-
|
22
|
-
def to_s
|
23
|
-
"#{path.inspect}"
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|