nlp 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/dict/liwc +11 -12
- data/dict/rid +7 -7
- data/lib/nlp.rb +27 -5
- data/lib/stdlib/ext/array.rb +1 -0
- data/lib/text_statistics.rb +53 -0
- metadata +42 -55
- data/lib/analyzer.rb +0 -50
- data/lib/category.rb +0 -27
- data/lib/dictionary.rb +0 -85
- data/lib/emoticon.rb +0 -14
- data/lib/inflectable.rb +0 -60
- data/lib/lemmatizer.rb +0 -112
- data/lib/liwc_analyzer.rb +0 -74
- data/lib/liwc_category.rb +0 -61
- data/lib/meaningable.rb +0 -69
- data/lib/rid_analyzer.rb +0 -10
- data/lib/rid_category.rb +0 -17
- data/lib/sentence.rb +0 -24
- data/lib/statistic.rb +0 -55
- data/lib/stdlib/ext/string.rb +0 -19
- data/lib/stree.rb +0 -85
- data/lib/takipi_web_service.rb +0 -51
- data/lib/text.rb +0 -26
- data/lib/token.rb +0 -37
- data/lib/token_scanner.rb +0 -60
- data/lib/word.rb +0 -23
- data/test/analyzer_test.rb +0 -25
- data/test/helper.rb +0 -9
- data/test/lemmatizer_test.rb +0 -73
- data/test/meaningable_test.rb +0 -28
- data/test/nlp_test_suite.rb +0 -11
- data/test/sentence_test.rb +0 -26
- data/test/test_nlp.rb +0 -7
- data/test/text_test.rb +0 -29
- data/test/token_scanner_test.rb +0 -28
- data/test/token_test.rb +0 -37
- data/test/word_test.rb +0 -45
data/dict/liwc
CHANGED
@@ -509,7 +509,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
509
509
|
obraźliwy
|
510
510
|
obrona
|
511
511
|
oburzenie
|
512
|
-
obwiniać
|
512
|
+
obwiniać
|
513
513
|
ofiara
|
514
514
|
okrutny
|
515
515
|
oszustwo
|
@@ -637,7 +637,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
637
637
|
tragedia
|
638
638
|
tragiczny
|
639
639
|
współczucie
|
640
|
-
|
640
|
+
współczuć
|
641
641
|
żal
|
642
642
|
żałosny
|
643
643
|
zawalić
|
@@ -851,7 +851,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
851
851
|
zaprzeczać
|
852
852
|
zatkać
|
853
853
|
zatrzymanie
|
854
|
-
zatrzymywać
|
854
|
+
zatrzymywać
|
855
855
|
zawada
|
856
856
|
zawierać
|
857
857
|
zignorować
|
@@ -1153,7 +1153,7 @@ PROCESY_PSYCHOLOGICZNE
|
|
1153
1153
|
kontakt
|
1154
1154
|
łapać
|
1155
1155
|
macać
|
1156
|
-
obmacywać
|
1156
|
+
obmacywać
|
1157
1157
|
odczuwać
|
1158
1158
|
poczucie
|
1159
1159
|
przecierać
|
@@ -1614,7 +1614,7 @@ OSOBISTE
|
|
1614
1614
|
wymagania
|
1615
1615
|
zadanie domowe
|
1616
1616
|
zaganiany
|
1617
|
-
zarządzanie
|
1617
|
+
zarządzanie
|
1618
1618
|
zespół
|
1619
1619
|
zgłoszenie
|
1620
1620
|
PRACA
|
@@ -1664,7 +1664,7 @@ OSOBISTE
|
|
1664
1664
|
przedsiębiorstwo
|
1665
1665
|
przemysł
|
1666
1666
|
przemysłowy
|
1667
|
-
przyjmować
|
1667
|
+
przyjmować
|
1668
1668
|
rozmowa kwalfikacyjna
|
1669
1669
|
rynek
|
1670
1670
|
sekretarka
|
@@ -1677,7 +1677,7 @@ OSOBISTE
|
|
1677
1677
|
współpracownik
|
1678
1678
|
wynagrodzenie
|
1679
1679
|
wypłata
|
1680
|
-
wyposażenie
|
1680
|
+
wyposażenie
|
1681
1681
|
wyrzucać
|
1682
1682
|
wywalać
|
1683
1683
|
zajęcie
|
@@ -1733,7 +1733,7 @@ OSOBISTE
|
|
1733
1733
|
przegrany
|
1734
1734
|
przewyższać
|
1735
1735
|
przodować
|
1736
|
-
rezultat
|
1736
|
+
rezultat
|
1737
1737
|
rozwiązać
|
1738
1738
|
rozwiązanie
|
1739
1739
|
silny
|
@@ -1918,7 +1918,6 @@ OSOBISTE
|
|
1918
1918
|
pożyczyć
|
1919
1919
|
profit
|
1920
1920
|
prowizja
|
1921
|
-
prowizja
|
1922
1921
|
rentowny
|
1923
1922
|
sklep
|
1924
1923
|
spadek
|
@@ -2000,8 +1999,8 @@ OSOBISTE
|
|
2000
1999
|
wiara
|
2001
2000
|
wieczność
|
2002
2001
|
wieczny
|
2003
|
-
|
2004
|
-
|
2002
|
+
wielkanoc
|
2003
|
+
żyd
|
2005
2004
|
żydowski
|
2006
2005
|
ŚMIERĆ
|
2007
2006
|
cmentarz
|
@@ -2383,7 +2382,7 @@ OSOBISTE
|
|
2383
2382
|
zmęczony
|
2384
2383
|
TOALETA
|
2385
2384
|
czysty
|
2386
|
-
czyścić
|
2385
|
+
czyścić
|
2387
2386
|
kąpać
|
2388
2387
|
kąpiel
|
2389
2388
|
golić
|
data/dict/rid
CHANGED
@@ -936,7 +936,7 @@ PIERWOTNE
|
|
936
936
|
schnąć
|
937
937
|
śmierć
|
938
938
|
spokój
|
939
|
-
spokojny
|
939
|
+
spokojny
|
940
940
|
stagnacja
|
941
941
|
statyczny
|
942
942
|
tapczan
|
@@ -959,7 +959,7 @@ PIERWOTNE
|
|
959
959
|
zmiękczenie
|
960
960
|
znużenie
|
961
961
|
zrelaksować
|
962
|
-
zrelaksowany
|
962
|
+
zrelaksowany
|
963
963
|
PODRÓŻ
|
964
964
|
agitować
|
965
965
|
aktywność
|
@@ -1140,7 +1140,7 @@ PIERWOTNE
|
|
1140
1140
|
REGRESYWNOŚĆ
|
1141
1141
|
NIEZNANE
|
1142
1142
|
bezgraniczny
|
1143
|
-
bezimienny
|
1143
|
+
bezimienny
|
1144
1144
|
bezkształtny
|
1145
1145
|
cudowny
|
1146
1146
|
czarodziej
|
@@ -1490,7 +1490,7 @@ PIERWOTNE
|
|
1490
1490
|
pływak
|
1491
1491
|
podnosić
|
1492
1492
|
podrzucać
|
1493
|
-
podskakiwać
|
1493
|
+
podskakiwać
|
1494
1494
|
powiesić
|
1495
1495
|
powstały
|
1496
1496
|
powstawać
|
@@ -1791,7 +1791,7 @@ WTORNE
|
|
1791
1791
|
koncept
|
1792
1792
|
konkludować
|
1793
1793
|
konsekwentnie
|
1794
|
-
|
1794
|
+
kryterium
|
1795
1795
|
kwantyfikować
|
1796
1796
|
kwestia
|
1797
1797
|
liczyć
|
@@ -1961,7 +1961,7 @@ WTORNE
|
|
1961
1961
|
obdarzać
|
1962
1962
|
obiecać
|
1963
1963
|
obietnica
|
1964
|
-
obsłużyć
|
1964
|
+
obsłużyć
|
1965
1965
|
obwieścić
|
1966
1966
|
ochraniać
|
1967
1967
|
oddziałowywać
|
@@ -2960,7 +2960,7 @@ EMOCJE
|
|
2960
2960
|
sprzeciwiać
|
2961
2961
|
sprzeczać
|
2962
2962
|
sprzeczać
|
2963
|
-
srogi
|
2963
|
+
srogi
|
2964
2964
|
strzał
|
2965
2965
|
strzała
|
2966
2966
|
strzelać
|
data/lib/nlp.rb
CHANGED
@@ -1,14 +1,36 @@
|
|
1
1
|
module NLP
|
2
|
-
|
3
2
|
TAKIPI_XML_FILE = "/tmp/output.xml"
|
4
3
|
DICTIONARY_CACHE_DIR = "~/"
|
5
|
-
|
6
|
-
|
7
4
|
end
|
8
5
|
|
9
6
|
|
10
7
|
require 'stdlib/ext/array'
|
11
|
-
require '
|
12
|
-
|
8
|
+
require 'morfeusz'
|
9
|
+
|
10
|
+
require "analizators/analyzer"
|
11
|
+
require "analizators/rid_analyzer.rb"
|
12
|
+
require "analizators/liwc_analyzer.rb"
|
13
|
+
|
14
|
+
require "dictionaries/pl_trie"
|
15
|
+
require 'dictionaries/dictionary'
|
16
|
+
require 'dictionaries/category'
|
17
|
+
require "dictionaries/liwc_category"
|
18
|
+
require "dictionaries/rid_category"
|
19
|
+
|
20
|
+
|
21
|
+
require "tagger/inflectable"
|
22
|
+
require "tagger/meaningable"
|
23
|
+
require 'tagger/token'
|
24
|
+
require 'tagger/word'
|
25
|
+
require 'tagger/emoticon'
|
26
|
+
require 'tagger/sentence'
|
27
|
+
require 'tagger/text'
|
28
|
+
require "tagger/token_scanner"
|
29
|
+
require "tagger/takipi_web_service"
|
30
|
+
require "tagger/lemmatizer"
|
31
|
+
|
32
|
+
require "text_statistics"
|
13
33
|
|
34
|
+
require 'jcode'
|
35
|
+
$KCODE = "UTF8"
|
14
36
|
|
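data/lib/stdlib/ext/array.rb
CHANGED
[hunk not captured in this extract; the file list above records +1 -0]
data/lib/text_statistics.rb
ADDED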
@@ -0,0 +1,53 @@
|
|
1
|
+
module NLP
|
2
|
+
class TextStatistics
|
3
|
+
|
4
|
+
attr_accessor :total_words, :hash
|
5
|
+
attr_reader :cwords, :words, :total_words, :word_count, :scores
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@word_count = 0 # number of found words
|
9
|
+
@total_words = 0 # total number of words
|
10
|
+
@scores = Hash.new { 0 } #numbers of words in each category
|
11
|
+
@words = [] #found words
|
12
|
+
@cwords = Hash.new {nil} #found words grouped into categories
|
13
|
+
@hash = {} #additional data
|
14
|
+
end
|
15
|
+
|
16
|
+
#Adds word and its category to stats.
|
17
|
+
def add(word,categories)
|
18
|
+
categories.each do |category|
|
19
|
+
@cwords[category] = [] if @cwords[category].nil?
|
20
|
+
@cwords[category].push word
|
21
|
+
@scores[category] += 1
|
22
|
+
end
|
23
|
+
@words.push word
|
24
|
+
@word_count += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
def [](key)
|
28
|
+
@hash[key]
|
29
|
+
end
|
30
|
+
|
31
|
+
def []=(key,value)
|
32
|
+
@hash[key] = value
|
33
|
+
end
|
34
|
+
|
35
|
+
def category_participation(categories)
|
36
|
+
sorted_scores = @scores.to_a.sort_by{ |result| -result[1] }
|
37
|
+
r = {}
|
38
|
+
categories.each do |cat|
|
39
|
+
r[cat] = percentage_distribution(sorted_scores){|c| c.send(cat.to_s+'?')}
|
40
|
+
end
|
41
|
+
r
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
def percentage_distribution scores, &block
|
46
|
+
all = scores.map{|k,v| v}.inject(0){|e,m|m = m +e}
|
47
|
+
sum = scores.select{|result| yield result[0]}.inject(0){|count,result| count + result[1]}
|
48
|
+
Float(sum)/all
|
49
|
+
end
|
50
|
+
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
5
|
-
prerelease:
|
4
|
+
hash: 25
|
5
|
+
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 2
|
9
|
-
- 6
|
10
|
-
version: 0.2.6
|
9
|
+
- 7
|
10
|
+
version: 0.2.7
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- knife
|
@@ -15,10 +15,38 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-
|
19
|
-
|
20
|
-
|
21
|
-
|
18
|
+
date: 2011-09-13 00:00:00 Z
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: savon
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - "="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
hash: 17
|
29
|
+
segments:
|
30
|
+
- 0
|
31
|
+
- 7
|
32
|
+
- 9
|
33
|
+
version: 0.7.9
|
34
|
+
type: :runtime
|
35
|
+
version_requirements: *id001
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: ds
|
38
|
+
prerelease: false
|
39
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
hash: 3
|
45
|
+
segments:
|
46
|
+
- 0
|
47
|
+
version: "0"
|
48
|
+
type: :runtime
|
49
|
+
version_requirements: *id002
|
22
50
|
description: Tools for processing polish language. Tokenization, scanning, categorization...
|
23
51
|
email: satre@o2.pl
|
24
52
|
executables: []
|
@@ -31,49 +59,18 @@ extra_rdoc_files:
|
|
31
59
|
files:
|
32
60
|
- dict/liwc
|
33
61
|
- dict/rid
|
34
|
-
- lib/analyzer.rb
|
35
|
-
- lib/category.rb
|
36
|
-
- lib/dictionary.rb
|
37
|
-
- lib/emoticon.rb
|
38
|
-
- lib/inflectable.rb
|
39
|
-
- lib/lemmatizer.rb
|
40
|
-
- lib/liwc_analyzer.rb
|
41
|
-
- lib/liwc_category.rb
|
42
|
-
- lib/meaningable.rb
|
43
62
|
- lib/morfeusz.rb
|
44
63
|
- lib/nlp.rb
|
45
|
-
- lib/rid_analyzer.rb
|
46
|
-
- lib/rid_category.rb
|
47
|
-
- lib/sentence.rb
|
48
|
-
- lib/statistic.rb
|
49
64
|
- lib/stdlib/ext/array.rb
|
50
|
-
- lib/
|
51
|
-
- lib/stree.rb
|
52
|
-
- lib/takipi_web_service.rb
|
53
|
-
- lib/text.rb
|
54
|
-
- lib/token.rb
|
55
|
-
- lib/token_scanner.rb
|
56
|
-
- lib/word.rb
|
65
|
+
- lib/text_statistics.rb
|
57
66
|
- LICENSE
|
58
67
|
- README.rdoc
|
59
|
-
- test/sentence_test.rb
|
60
|
-
- test/analyzer_test.rb
|
61
|
-
- test/meaningable_test.rb
|
62
|
-
- test/token_scanner_test.rb
|
63
|
-
- test/helper.rb
|
64
|
-
- test/nlp_test_suite.rb
|
65
|
-
- test/test_nlp.rb
|
66
|
-
- test/word_test.rb
|
67
|
-
- test/lemmatizer_test.rb
|
68
|
-
- test/token_test.rb
|
69
|
-
- test/text_test.rb
|
70
|
-
has_rdoc: true
|
71
68
|
homepage: http://github.com/knife/nlp
|
72
69
|
licenses: []
|
73
70
|
|
74
71
|
post_install_message:
|
75
|
-
rdoc_options:
|
76
|
-
|
72
|
+
rdoc_options: []
|
73
|
+
|
77
74
|
require_paths:
|
78
75
|
- lib
|
79
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -97,19 +94,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
94
|
requirements: []
|
98
95
|
|
99
96
|
rubyforge_project:
|
100
|
-
rubygems_version: 1.
|
97
|
+
rubygems_version: 1.7.2
|
101
98
|
signing_key:
|
102
99
|
specification_version: 3
|
103
100
|
summary: Linguistics tools for processing polish language.
|
104
|
-
test_files:
|
105
|
-
|
106
|
-
- test/analyzer_test.rb
|
107
|
-
- test/meaningable_test.rb
|
108
|
-
- test/token_scanner_test.rb
|
109
|
-
- test/helper.rb
|
110
|
-
- test/nlp_test_suite.rb
|
111
|
-
- test/test_nlp.rb
|
112
|
-
- test/word_test.rb
|
113
|
-
- test/lemmatizer_test.rb
|
114
|
-
- test/token_test.rb
|
115
|
-
- test/text_test.rb
|
101
|
+
test_files: []
|
102
|
+
|
data/lib/analyzer.rb
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
require 'dictionary'
|
2
|
-
#require 'morfeusz'
|
3
|
-
require 'token'
|
4
|
-
require 'word'
|
5
|
-
require 'token'
|
6
|
-
require 'text'
|
7
|
-
require 'emoticon'
|
8
|
-
require 'sentence'
|
9
|
-
require "token_scanner.rb"
|
10
|
-
require "lemmatizer"
|
11
|
-
require 'jcode'
|
12
|
-
require 'statistic'
|
13
|
-
$KODE = "UTF8"
|
14
|
-
|
15
|
-
module NLP
|
16
|
-
|
17
|
-
class Analyzer
|
18
|
-
|
19
|
-
def initialize(dict)
|
20
|
-
@dictionary = Dictionary.new(dict)
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
def analyze(scanner)
|
25
|
-
|
26
|
-
results = Statistic.new
|
27
|
-
|
28
|
-
while token = scanner.current
|
29
|
-
word = token.lemat
|
30
|
-
|
31
|
-
categories = @dictionary.find(word.gsub(/[^\w-]/, "" ))
|
32
|
-
unless categories.nil?
|
33
|
-
categories.each do |category|
|
34
|
-
puts "Znalazłem słowo #{word} : #{category}"
|
35
|
-
results.add(word,category)
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
results.total_words += 1
|
40
|
-
scanner.next(:word)
|
41
|
-
end
|
42
|
-
|
43
|
-
results
|
44
|
-
|
45
|
-
end
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
require "rid_analyzer.rb"
|
50
|
-
require "liwc_analyzer.rb"
|
data/lib/category.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
module NLP
|
2
|
-
class Category
|
3
|
-
attr_reader :parent, :name
|
4
|
-
|
5
|
-
def initialize(name, parent = nil)
|
6
|
-
@parent = parent
|
7
|
-
@name = name.to_sym
|
8
|
-
end
|
9
|
-
|
10
|
-
def path
|
11
|
-
@parent ? (@parent.path + '/' + name.to_s) : name.to_s
|
12
|
-
end
|
13
|
-
|
14
|
-
def root
|
15
|
-
category = self
|
16
|
-
while category.parent != nil
|
17
|
-
category = category.parent
|
18
|
-
end
|
19
|
-
category.name
|
20
|
-
end
|
21
|
-
|
22
|
-
def to_s
|
23
|
-
"#{path.inspect}"
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|