nlp 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/analyzer.rb +15 -47
- data/lib/category.rb +7 -8
- data/lib/dictionary.rb +30 -28
- data/lib/emoticon.rb +8 -8
- data/lib/inflectable.rb +58 -59
- data/lib/lemmatizer.rb +86 -82
- data/lib/liwc_analyzer.rb +68 -91
- data/lib/liwc_category.rb +42 -43
- data/lib/meaningable.rb +44 -51
- data/lib/nlp.rb +10 -0
- data/lib/rid_analyzer.rb +5 -69
- data/lib/rid_category.rb +5 -6
- data/lib/sentence.rb +19 -11
- data/lib/statistic.rb +55 -0
- data/lib/stdlib/ext/array.rb +7 -0
- data/lib/stree.rb +39 -39
- data/lib/takipi_web_service.rb +45 -45
- data/lib/text.rb +18 -17
- data/lib/token.rb +28 -25
- data/lib/token_scanner.rb +43 -55
- data/lib/word.rb +14 -14
- data/test/analyzer_test.rb +25 -0
- data/test/lemmatizer_test.rb +73 -0
- data/test/meaningable_test.rb +28 -0
- data/test/nlp_test_suite.rb +11 -0
- data/test/sentence_test.rb +26 -0
- data/test/text_test.rb +29 -0
- data/test/token_scanner_test.rb +28 -0
- data/test/token_test.rb +37 -0
- data/test/word_test.rb +39 -36
- metadata +21 -5
- data/lib/takipi_web_service +0 -0
data/lib/liwc_analyzer.rb
CHANGED
@@ -1,97 +1,74 @@
|
|
1
1
|
module NLP
|
2
|
-
class LIWCAnalyzer < Analyzer
|
3
|
-
|
4
|
-
def initialize(
|
5
|
-
|
6
|
-
if restore
|
7
|
-
@dictionary = Dictionary.restore(state_file)
|
8
|
-
else
|
9
|
-
@dictionary = Dictionary.new
|
10
|
-
@dictionary.load_categories( category_file, :rid => false )
|
11
|
-
@dictionary.store(state_file)
|
12
|
-
end
|
13
|
-
|
2
|
+
class LIWCAnalyzer < Analyzer
|
3
|
+
|
4
|
+
def initialize(dicts)
|
5
|
+
@dictionary = Dictionary.new(:liwc)
|
14
6
|
end
|
15
7
|
|
16
8
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
results[:przyimki].push word if token.przyimek?
|
84
|
-
results[:numbers].push token.orth if token.number? or token.liczebnik?
|
85
|
-
|
86
|
-
|
87
|
-
results[:word_total] += 1
|
88
|
-
scanner.next(:alphanum)
|
89
|
-
end
|
90
|
-
results
|
91
|
-
|
92
|
-
end
|
93
|
-
|
94
|
-
|
95
|
-
end
|
9
|
+
def analyze(scanner)
|
10
|
+
|
11
|
+
results = Statistic.new
|
12
|
+
results.hash = {
|
13
|
+
:long_words => [],
|
14
|
+
:zaimki => [],
|
15
|
+
:zaimki1 => [],
|
16
|
+
:zaimki2 => [],
|
17
|
+
:zaimki3 => [],
|
18
|
+
:przyimki => [],
|
19
|
+
:numbers => [],
|
20
|
+
:emotion => [],
|
21
|
+
:social => [],
|
22
|
+
:personal => [],
|
23
|
+
:posemotion => [],
|
24
|
+
:negemotion => [],
|
25
|
+
:wulgar => [],
|
26
|
+
:cognitive => []
|
27
|
+
}
|
28
|
+
|
29
|
+
while token = scanner.current
|
30
|
+
word = token.lemat
|
31
|
+
|
32
|
+
categories = @dictionary.find(word.gsub( /[^\w-]/, "" ))
|
33
|
+
unless categories.nil?
|
34
|
+
categories.each do |category|
|
35
|
+
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
36
|
+
token.category = category
|
37
|
+
results.add(word,category)
|
38
|
+
|
39
|
+
|
40
|
+
results[:emotion].push token.orth if token.emotion?
|
41
|
+
results[:social].push token.orth if token.social?
|
42
|
+
results[:personal].push token.orth if token.personal?
|
43
|
+
results[:wulgar].push token.orth if token.bad_word?
|
44
|
+
results[:cognitive].push token.orth if token.cognitive?
|
45
|
+
|
46
|
+
results[:posemotion].push token.orth if token.positive_emotion?
|
47
|
+
results[:negemotion].push token.orth if token.negative_emotion?
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
#words longer than 10
|
52
|
+
results[:long_words].push word if word.jlength > 10
|
53
|
+
if token.zaimek?
|
54
|
+
results[:zaimki].push word
|
55
|
+
|
56
|
+
results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
|
57
|
+
results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
|
58
|
+
results[:zaimki3].push token.orth if word === 'on'
|
59
|
+
end
|
60
|
+
|
61
|
+
results[:przyimki].push word if token.przyimek?
|
62
|
+
results[:numbers].push token.orth if token.number? or token.liczebnik?
|
63
|
+
|
64
|
+
|
65
|
+
results.total_words += 1
|
66
|
+
scanner.next(:alphanum)
|
67
|
+
end
|
68
|
+
results
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
96
73
|
|
97
74
|
end
|
data/lib/liwc_category.rb
CHANGED
@@ -1,62 +1,61 @@
|
|
1
1
|
module NLP
|
2
2
|
|
3
|
-
|
3
|
+
class LIWCCategory < Category
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
def linguistic?
|
8
|
-
root == :PIERWOTNE
|
9
|
-
end
|
10
|
-
|
11
|
-
def psychological?
|
12
|
-
root == :PROCESY_PSYCHOLOGICZNE
|
13
|
-
end
|
5
|
+
#primary categories
|
14
6
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
7
|
+
def linguistic?
|
8
|
+
root == :PIERWOTNE
|
9
|
+
end
|
10
|
+
|
11
|
+
def psychological?
|
12
|
+
root == :PROCESY_PSYCHOLOGICZNE
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
def relative?
|
17
|
+
root === :RELATYWNOSC
|
18
|
+
end
|
19
|
+
|
20
|
+
def personal?
|
21
|
+
root == :OSOBISTE
|
22
|
+
end
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
def emotion?
|
27
|
-
path.include? 'EMOCJE'
|
24
|
+
#second categories
|
28
25
|
|
29
|
-
|
26
|
+
def emotion?
|
27
|
+
path.include? 'EMOCJE'
|
30
28
|
|
31
|
-
|
32
|
-
path.include? 'POZYTYWNE_EMOCJE'
|
33
|
-
|
34
|
-
end
|
29
|
+
end
|
35
30
|
|
36
|
-
|
37
|
-
|
31
|
+
def positive_emotion?
|
32
|
+
path.include? 'POZYTYWNE_EMOCJE'
|
38
33
|
|
39
|
-
|
34
|
+
end
|
40
35
|
|
41
|
-
|
42
|
-
|
36
|
+
def negative_emotion?
|
37
|
+
path.include? 'NEGATYWNE_EMOCJE'
|
43
38
|
|
44
|
-
|
39
|
+
end
|
45
40
|
|
46
|
-
|
47
|
-
|
48
|
-
end
|
41
|
+
def cognitive?
|
42
|
+
path.include? 'KOGNITYWNE_PROCESY'
|
49
43
|
|
50
|
-
|
51
|
-
path.include? 'SOCIAL'
|
44
|
+
end
|
52
45
|
|
53
|
-
|
46
|
+
def sense?
|
47
|
+
path.include? 'ZMYSLY'
|
48
|
+
end
|
54
49
|
|
55
|
-
|
56
|
-
|
57
|
-
end
|
50
|
+
def social?
|
51
|
+
path.include? 'SOCIAL'
|
58
52
|
|
53
|
+
end
|
59
54
|
|
60
|
-
|
55
|
+
def bad_word?
|
56
|
+
path.include? 'WULGAR'
|
61
57
|
end
|
58
|
+
|
59
|
+
|
60
|
+
end
|
62
61
|
end
|
data/lib/meaningable.rb
CHANGED
@@ -1,76 +1,69 @@
|
|
1
1
|
module Meaningable
|
2
2
|
|
3
|
-
|
3
|
+
#LIWC
|
4
|
+
#primary categories
|
4
5
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
def linguistic?
|
9
|
-
category.root == :PIERWOTNE
|
10
|
-
end
|
11
|
-
|
12
|
-
def psychological?
|
13
|
-
category.root == :PROCESY_PSYCHOLOGICZNE
|
14
|
-
end
|
6
|
+
def linguistic?
|
7
|
+
category.root == :PIERWOTNE
|
8
|
+
end
|
15
9
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
|
21
|
-
def personal?
|
22
|
-
category.root == :OSOBISTE
|
23
|
-
end
|
10
|
+
def psychological?
|
11
|
+
category.root == :PROCESY_PSYCHOLOGICZNE
|
12
|
+
end
|
24
13
|
|
25
|
-
#second categories
|
26
|
-
|
27
|
-
def emotion?
|
28
|
-
category.path.include? 'EMOCJE'
|
29
14
|
|
30
|
-
|
15
|
+
def relative?
|
16
|
+
category.root === :RELATYWNOSC
|
17
|
+
end
|
31
18
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
end
|
19
|
+
def personal?
|
20
|
+
category.root == :OSOBISTE
|
21
|
+
end
|
36
22
|
|
37
|
-
|
38
|
-
category.path.include? 'NEGATYWNE_EMOCJE'
|
23
|
+
#second categories
|
39
24
|
|
40
|
-
|
25
|
+
def emotion?
|
26
|
+
category.path.include? 'EMOCJE'
|
41
27
|
|
42
|
-
|
43
|
-
category.path.include? 'KOGNITYWNE_PROCESY'
|
28
|
+
end
|
44
29
|
|
45
|
-
|
30
|
+
def positive_emotion?
|
31
|
+
category.path.include? 'POZYTYWNE_EMOCJE'
|
46
32
|
|
47
|
-
|
48
|
-
category.path.include? 'ZMYSLY'
|
49
|
-
end
|
33
|
+
end
|
50
34
|
|
51
|
-
|
52
|
-
|
35
|
+
def negative_emotion?
|
36
|
+
category.path.include? 'NEGATYWNE_EMOCJE'
|
53
37
|
|
54
|
-
|
38
|
+
end
|
55
39
|
|
56
|
-
|
57
|
-
|
58
|
-
end
|
40
|
+
def cognitive?
|
41
|
+
category.path.include? 'KOGNITYWNE_PROCESY'
|
59
42
|
|
43
|
+
end
|
60
44
|
|
45
|
+
def sense?
|
46
|
+
category.path.include? 'ZMYSLY'
|
47
|
+
end
|
61
48
|
|
49
|
+
def social?
|
50
|
+
category.path.include? 'SOCIAL'
|
62
51
|
|
52
|
+
end
|
63
53
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
54
|
+
def bad_word?
|
55
|
+
category.path.include? 'WULGAR'
|
56
|
+
end
|
68
57
|
|
69
|
-
def synonyms
|
70
58
|
|
71
|
-
|
72
|
-
|
59
|
+
#SEMANTIC
|
60
|
+
def synonym?(other)
|
61
|
+
|
62
|
+
end
|
63
|
+
|
64
|
+
def synonyms
|
65
|
+
|
66
|
+
end
|
73
67
|
|
74
68
|
|
75
|
-
|
76
69
|
end
|
data/lib/nlp.rb
CHANGED
data/lib/rid_analyzer.rb
CHANGED
@@ -1,74 +1,10 @@
|
|
1
1
|
module NLP
|
2
|
-
class RIDAnalyzer < NLP::Analyzer
|
3
|
-
|
4
|
-
|
5
|
-
def initialize( category_file, restore = true )
|
6
|
-
state_file = File.expand_path(Analyzer::CACHE_DIR+'.rid')
|
7
|
-
if restore
|
8
|
-
@dictionary = Dictionary.restore(state_file)
|
9
|
-
else
|
10
|
-
@dictionary = Dictionary.new
|
11
|
-
@dictionary.load_categories( category_file, :rid => true )
|
12
|
-
@dictionary.store(state_file)
|
13
|
-
end
|
14
|
-
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
def analyze(scanner)
|
19
|
-
results = {
|
20
|
-
:word_count => 0,
|
21
|
-
:word_total => 0,
|
22
|
-
:scores => Hash.new { 0 },
|
23
|
-
:words => [],
|
24
|
-
:cwords => Hash.new { nil }
|
25
|
-
}
|
26
|
-
|
27
|
-
while token = scanner.current
|
28
|
-
word = token.lemat
|
29
|
-
|
30
|
-
categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
|
31
|
-
unless categories.nil?
|
32
|
-
categories.each do |category|
|
33
|
-
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
34
|
-
results[:scores][category] = results[:scores][category] + 1
|
35
|
-
category = category.name
|
36
|
-
if results[:cwords][category].nil?
|
37
|
-
results[:cwords][category] = []
|
38
|
-
end
|
39
|
-
results[:cwords][category].push word
|
40
|
-
results[:word_count] += 1
|
41
|
-
results[:words].push word
|
42
|
-
end
|
43
|
-
|
44
|
-
|
45
|
-
end
|
46
|
-
|
47
|
-
results[:word_total] += 1
|
48
|
-
scanner.next(:word)
|
49
|
-
end
|
50
|
-
|
51
|
-
results[:sorted_scores] = results[:scores].to_a.sort_by { |result| -result[1] }
|
52
|
-
p primary_sum = results[:sorted_scores].select { |result| result[0].primary? }.inject( 0 ) { |count,result| count + result[1] }
|
53
|
-
p secondary_sum = results[:sorted_scores].select { |result| result[0].secondary? }.inject( 0 ) { |count,result| count + result[1] }
|
54
|
-
p emotion_sum = results[:sorted_scores].select { |result| result[0].emotions? }.inject( 0 ) { |count,result| count + result[1] }
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
results[:classes] = {
|
59
|
-
:primary => Float(primary_sum) / results[:word_count],
|
60
|
-
:secondary => Float(secondary_sum) / results[:word_count],
|
61
|
-
:emotions => Float(emotion_sum) / results[:word_count]
|
62
|
-
}
|
63
|
-
|
64
|
-
results
|
65
|
-
end
|
66
|
-
|
67
|
-
|
68
|
-
|
69
2
|
|
3
|
+
class RIDAnalyzer < Analyzer
|
70
4
|
|
5
|
+
def initialize
|
6
|
+
@dictionary = Dictionary.new(:rid)
|
71
7
|
end
|
72
|
-
|
73
|
-
|
8
|
+
|
9
|
+
end
|
74
10
|
end
|