nlp 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/liwc_analyzer.rb
CHANGED
@@ -1,97 +1,74 @@
|
|
1
1
|
module NLP
|
2
|
-
class LIWCAnalyzer < Analyzer
|
3
|
-
|
4
|
-
def initialize(
|
5
|
-
|
6
|
-
if restore
|
7
|
-
@dictionary = Dictionary.restore(state_file)
|
8
|
-
else
|
9
|
-
@dictionary = Dictionary.new
|
10
|
-
@dictionary.load_categories( category_file, :rid => false )
|
11
|
-
@dictionary.store(state_file)
|
12
|
-
end
|
13
|
-
|
2
|
+
class LIWCAnalyzer < Analyzer
|
3
|
+
|
4
|
+
def initialize(dicts)
|
5
|
+
@dictionary = Dictionary.new(:liwc)
|
14
6
|
end
|
15
7
|
|
16
8
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
results[:przyimki].push word if token.przyimek?
|
84
|
-
results[:numbers].push token.orth if token.number? or token.liczebnik?
|
85
|
-
|
86
|
-
|
87
|
-
results[:word_total] += 1
|
88
|
-
scanner.next(:alphanum)
|
89
|
-
end
|
90
|
-
results
|
91
|
-
|
92
|
-
end
|
93
|
-
|
94
|
-
|
95
|
-
end
|
9
|
+
def analyze(scanner)
|
10
|
+
|
11
|
+
results = Statistic.new
|
12
|
+
results.hash = {
|
13
|
+
:long_words => [],
|
14
|
+
:zaimki => [],
|
15
|
+
:zaimki1 => [],
|
16
|
+
:zaimki2 => [],
|
17
|
+
:zaimki3 => [],
|
18
|
+
:przyimki => [],
|
19
|
+
:numbers => [],
|
20
|
+
:emotion => [],
|
21
|
+
:social => [],
|
22
|
+
:personal => [],
|
23
|
+
:posemotion => [],
|
24
|
+
:negemotion => [],
|
25
|
+
:wulgar => [],
|
26
|
+
:cognitive => []
|
27
|
+
}
|
28
|
+
|
29
|
+
while token = scanner.current
|
30
|
+
word = token.lemat
|
31
|
+
|
32
|
+
categories = @dictionary.find(word.gsub( /[^\w-]/, "" ))
|
33
|
+
unless categories.nil?
|
34
|
+
categories.each do |category|
|
35
|
+
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
36
|
+
token.category = category
|
37
|
+
results.add(word,category)
|
38
|
+
|
39
|
+
|
40
|
+
results[:emotion].push token.orth if token.emotion?
|
41
|
+
results[:social].push token.orth if token.social?
|
42
|
+
results[:personal].push token.orth if token.personal?
|
43
|
+
results[:wulgar].push token.orth if token.bad_word?
|
44
|
+
results[:cognitive].push token.orth if token.cognitive?
|
45
|
+
|
46
|
+
results[:posemotion].push token.orth if token.positive_emotion?
|
47
|
+
results[:negemotion].push token.orth if token.negative_emotion?
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
#words longer than 10
|
52
|
+
results[:long_words].push word if word.jlength > 10
|
53
|
+
if token.zaimek?
|
54
|
+
results[:zaimki].push word
|
55
|
+
|
56
|
+
results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
|
57
|
+
results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
|
58
|
+
results[:zaimki3].push token.orth if word === 'on'
|
59
|
+
end
|
60
|
+
|
61
|
+
results[:przyimki].push word if token.przyimek?
|
62
|
+
results[:numbers].push token.orth if token.number? or token.liczebnik?
|
63
|
+
|
64
|
+
|
65
|
+
results.total_words += 1
|
66
|
+
scanner.next(:alphanum)
|
67
|
+
end
|
68
|
+
results
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
96
73
|
|
97
74
|
end
|
data/lib/liwc_category.rb
CHANGED
@@ -1,62 +1,61 @@
|
|
1
1
|
module NLP
|
2
2
|
|
3
|
-
|
3
|
+
class LIWCCategory < Category
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
def linguistic?
|
8
|
-
root == :PIERWOTNE
|
9
|
-
end
|
10
|
-
|
11
|
-
def psychological?
|
12
|
-
root == :PROCESY_PSYCHOLOGICZNE
|
13
|
-
end
|
5
|
+
#primary categories
|
14
6
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
7
|
+
def linguistic?
|
8
|
+
root == :PIERWOTNE
|
9
|
+
end
|
10
|
+
|
11
|
+
def psychological?
|
12
|
+
root == :PROCESY_PSYCHOLOGICZNE
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def relative?
|
17
|
+
root === :RELATYWNOSC
|
18
|
+
end
|
19
|
+
|
20
|
+
def personal?
|
21
|
+
root == :OSOBISTE
|
22
|
+
end
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
def emotion?
|
27
|
-
path.include? 'EMOCJE'
|
24
|
+
#second categories
|
28
25
|
|
29
|
-
|
26
|
+
def emotion?
|
27
|
+
path.include? 'EMOCJE'
|
30
28
|
|
31
|
-
|
32
|
-
path.include? 'POZYTYWNE_EMOCJE'
|
33
|
-
|
34
|
-
end
|
29
|
+
end
|
35
30
|
|
36
|
-
|
37
|
-
|
31
|
+
def positive_emotion?
|
32
|
+
path.include? 'POZYTYWNE_EMOCJE'
|
38
33
|
|
39
|
-
|
34
|
+
end
|
40
35
|
|
41
|
-
|
42
|
-
|
36
|
+
def negative_emotion?
|
37
|
+
path.include? 'NEGATYWNE_EMOCJE'
|
43
38
|
|
44
|
-
|
39
|
+
end
|
45
40
|
|
46
|
-
|
47
|
-
|
48
|
-
end
|
41
|
+
def cognitive?
|
42
|
+
path.include? 'KOGNITYWNE_PROCESY'
|
49
43
|
|
50
|
-
|
51
|
-
path.include? 'SOCIAL'
|
44
|
+
end
|
52
45
|
|
53
|
-
|
46
|
+
def sense?
|
47
|
+
path.include? 'ZMYSLY'
|
48
|
+
end
|
54
49
|
|
55
|
-
|
56
|
-
|
57
|
-
end
|
50
|
+
def social?
|
51
|
+
path.include? 'SOCIAL'
|
58
52
|
|
53
|
+
end
|
59
54
|
|
60
|
-
|
55
|
+
def bad_word?
|
56
|
+
path.include? 'WULGAR'
|
61
57
|
end
|
58
|
+
|
59
|
+
|
60
|
+
end
|
62
61
|
end
|
data/lib/meaningable.rb
CHANGED
@@ -1,76 +1,69 @@
|
|
1
1
|
module Meaningable
|
2
2
|
|
3
|
-
|
3
|
+
#LIWC
|
4
|
+
#primary categories
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def linguistic?
|
9
|
-
category.root == :PIERWOTNE
|
10
|
-
end
|
11
|
-
|
12
|
-
def psychological?
|
13
|
-
category.root == :PROCESY_PSYCHOLOGICZNE
|
14
|
-
end
|
6
|
+
def linguistic?
|
7
|
+
category.root == :PIERWOTNE
|
8
|
+
end
|
15
9
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
|
21
|
-
def personal?
|
22
|
-
category.root == :OSOBISTE
|
23
|
-
end
|
10
|
+
def psychological?
|
11
|
+
category.root == :PROCESY_PSYCHOLOGICZNE
|
12
|
+
end
|
24
13
|
|
25
|
-
#second categories
|
26
|
-
|
27
|
-
def emotion?
|
28
|
-
category.path.include? 'EMOCJE'
|
29
14
|
|
30
|
-
|
15
|
+
def relative?
|
16
|
+
category.root === :RELATYWNOSC
|
17
|
+
end
|
31
18
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
19
|
+
def personal?
|
20
|
+
category.root == :OSOBISTE
|
21
|
+
end
|
36
22
|
|
37
|
-
|
38
|
-
category.path.include? 'NEGATYWNE_EMOCJE'
|
23
|
+
#second categories
|
39
24
|
|
40
|
-
|
25
|
+
def emotion?
|
26
|
+
category.path.include? 'EMOCJE'
|
41
27
|
|
42
|
-
|
43
|
-
category.path.include? 'KOGNITYWNE_PROCESY'
|
28
|
+
end
|
44
29
|
|
45
|
-
|
30
|
+
def positive_emotion?
|
31
|
+
category.path.include? 'POZYTYWNE_EMOCJE'
|
46
32
|
|
47
|
-
|
48
|
-
category.path.include? 'ZMYSLY'
|
49
|
-
end
|
33
|
+
end
|
50
34
|
|
51
|
-
|
52
|
-
|
35
|
+
def negative_emotion?
|
36
|
+
category.path.include? 'NEGATYWNE_EMOCJE'
|
53
37
|
|
54
|
-
|
38
|
+
end
|
55
39
|
|
56
|
-
|
57
|
-
|
58
|
-
end
|
40
|
+
def cognitive?
|
41
|
+
category.path.include? 'KOGNITYWNE_PROCESY'
|
59
42
|
|
43
|
+
end
|
60
44
|
|
45
|
+
def sense?
|
46
|
+
category.path.include? 'ZMYSLY'
|
47
|
+
end
|
61
48
|
|
49
|
+
def social?
|
50
|
+
category.path.include? 'SOCIAL'
|
62
51
|
|
52
|
+
end
|
63
53
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
54
|
+
def bad_word?
|
55
|
+
category.path.include? 'WULGAR'
|
56
|
+
end
|
68
57
|
|
69
|
-
def synonyms
|
70
58
|
|
71
|
-
|
72
|
-
|
59
|
+
#SEMANTIC
|
60
|
+
def synonym?(other)
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
def synonyms
|
65
|
+
|
66
|
+
end
|
73
67
|
|
74
68
|
|
75
|
-
|
76
69
|
end
|
data/lib/nlp.rb
CHANGED
data/lib/rid_analyzer.rb
CHANGED
@@ -1,74 +1,10 @@
|
|
1
1
|
module NLP
|
2
|
-
class RIDAnalyzer < NLP::Analyzer
|
3
|
-
|
4
|
-
|
5
|
-
def initialize( category_file, restore = true )
|
6
|
-
state_file = File.expand_path(Analyzer::CACHE_DIR+'.rid')
|
7
|
-
if restore
|
8
|
-
@dictionary = Dictionary.restore(state_file)
|
9
|
-
else
|
10
|
-
@dictionary = Dictionary.new
|
11
|
-
@dictionary.load_categories( category_file, :rid => true )
|
12
|
-
@dictionary.store(state_file)
|
13
|
-
end
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def analyze(scanner)
|
19
|
-
results = {
|
20
|
-
:word_count => 0,
|
21
|
-
:word_total => 0,
|
22
|
-
:scores => Hash.new { 0 },
|
23
|
-
:words => [],
|
24
|
-
:cwords => Hash.new { nil }
|
25
|
-
}
|
26
|
-
|
27
|
-
while token = scanner.current
|
28
|
-
word = token.lemat
|
29
|
-
|
30
|
-
categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
|
31
|
-
unless categories.nil?
|
32
|
-
categories.each do |category|
|
33
|
-
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
34
|
-
results[:scores][category] = results[:scores][category] + 1
|
35
|
-
category = category.name
|
36
|
-
if results[:cwords][category].nil?
|
37
|
-
results[:cwords][category] = []
|
38
|
-
end
|
39
|
-
results[:cwords][category].push word
|
40
|
-
results[:word_count] += 1
|
41
|
-
results[:words].push word
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
results[:word_total] += 1
|
48
|
-
scanner.next(:word)
|
49
|
-
end
|
50
|
-
|
51
|
-
results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
|
52
|
-
p primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
|
53
|
-
p secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
|
54
|
-
p emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
results[:classes] = {
|
59
|
-
:primary => Float(primary_sum) / results[:word_count],
|
60
|
-
:secondary => Float(secondary_sum) / results[:word_count],
|
61
|
-
:emotions => Float(emotion_sum) / results[:word_count]
|
62
|
-
}
|
63
|
-
|
64
|
-
results
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
|
69
2
|
|
3
|
+
class RIDAnalyzer < Analyzer
|
70
4
|
|
5
|
+
def initialize
|
6
|
+
@dictionary = Dictionary.new(:rid)
|
71
7
|
end
|
72
|
-
|
73
|
-
|
8
|
+
|
9
|
+
end
|
74
10
|
end
|