nlp 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/dict/liwc +3486 -0
- data/dict/rid +2 -3
- data/lib/analyzer.rb +7 -4
- data/lib/dictionary.rb +17 -3
- data/lib/lemmatizer.rb +1 -1
- data/lib/liwc_analyzer.rb +88 -2
- data/lib/liwc_category.rb +56 -1
- data/lib/meaningable.rb +44 -23
- data/lib/rid_analyzer.rb +24 -2
- data/lib/sentence.rb +4 -0
- data/lib/text.rb +25 -0
- data/lib/token_scanner.rb +10 -3
- data/lib/word.rb +2 -0
- metadata +6 -4
data/dict/rid
CHANGED
|
@@ -234,8 +234,7 @@ PIERWOTNE
|
|
|
234
234
|
defakacja
|
|
235
235
|
dupa
|
|
236
236
|
dupek
|
|
237
|
-
|
|
238
|
-
enema
|
|
237
|
+
dupsko
|
|
239
238
|
fetor
|
|
240
239
|
gazy
|
|
241
240
|
gnić
|
|
@@ -1018,7 +1017,7 @@ PIERWOTNE
|
|
|
1018
1017
|
potrząsać
|
|
1019
1018
|
przejażdżka
|
|
1020
1019
|
przenosić się
|
|
1021
|
-
|
|
1020
|
+
PRZYPADKOWE_RUCHY
|
|
1022
1021
|
puchnąć
|
|
1023
1022
|
pulsować
|
|
1024
1023
|
pustynia
|
data/lib/analyzer.rb
CHANGED
|
@@ -2,25 +2,28 @@ require 'dictionary'
|
|
|
2
2
|
#require 'morfeusz'
|
|
3
3
|
require 'token'
|
|
4
4
|
require 'word'
|
|
5
|
+
require 'token'
|
|
6
|
+
require 'text'
|
|
5
7
|
require 'emoticon'
|
|
6
8
|
require 'sentence'
|
|
7
9
|
require "token_scanner.rb"
|
|
8
10
|
require "lemmatizer"
|
|
9
|
-
|
|
11
|
+
require 'jcode'
|
|
10
12
|
$KODE = "UTF8"
|
|
11
13
|
|
|
12
14
|
module NLP
|
|
13
15
|
|
|
14
16
|
class Analyzer
|
|
15
17
|
|
|
18
|
+
CACHE_DIR = '~/'
|
|
16
19
|
|
|
17
20
|
def initialize( category_file, restore = true )
|
|
18
|
-
state_file = File.expand_path(
|
|
21
|
+
state_file = File.expand_path(Analyzer::CACHE_DIR)
|
|
19
22
|
if restore
|
|
20
23
|
@dictionary = Dictionary.restore(state_file)
|
|
21
24
|
else
|
|
22
25
|
@dictionary = Dictionary.new
|
|
23
|
-
@dictionary.load_categories( category_file )
|
|
26
|
+
@dictionary.load_categories( category_file, :rid => true )
|
|
24
27
|
@dictionary.store(state_file)
|
|
25
28
|
end
|
|
26
29
|
|
|
@@ -33,7 +36,7 @@ module NLP
|
|
|
33
36
|
:word_count => 0,
|
|
34
37
|
:word_total => 0,
|
|
35
38
|
:scores => Hash.new { 0 },
|
|
36
|
-
:words => []
|
|
39
|
+
:words => []
|
|
37
40
|
}
|
|
38
41
|
|
|
39
42
|
|
data/lib/dictionary.rb
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
require 'stree'
|
|
3
3
|
require 'category'
|
|
4
4
|
require 'rid_category'
|
|
5
|
+
require 'liwc_category'
|
|
5
6
|
|
|
6
7
|
module NLP
|
|
7
8
|
class Dictionary
|
|
8
|
-
CACHE_DIR = '~/.rima'
|
|
9
9
|
def initialize
|
|
10
10
|
@tree = SearchTree.new
|
|
11
11
|
@categories = {}
|
|
@@ -35,7 +35,7 @@ module NLP
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
def load_categories( category_file )
|
|
38
|
+
def load_categories( category_file,options )
|
|
39
39
|
category = nil
|
|
40
40
|
primary = nil
|
|
41
41
|
secondary = nil
|
|
@@ -47,13 +47,27 @@ module NLP
|
|
|
47
47
|
begin
|
|
48
48
|
lead, rest = line.scan( /(\t*)(.*)/ ).first
|
|
49
49
|
if lead.size == 0
|
|
50
|
+
if options[:rid]
|
|
50
51
|
category = primary = RIDCategory.new( rest )
|
|
52
|
+
else
|
|
53
|
+
category = primary = LIWCCategory.new( rest )
|
|
54
|
+
end
|
|
55
|
+
|
|
51
56
|
secondary, tertiary = nil
|
|
52
57
|
elsif lead.size == 1
|
|
58
|
+
if options[:rid]
|
|
53
59
|
category = secondary = RIDCategory.new( rest, primary )
|
|
60
|
+
else
|
|
61
|
+
category = secondary = LIWCCategory.new(rest,primary)
|
|
62
|
+
end
|
|
54
63
|
tertiary = nil
|
|
55
|
-
elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ]+$/)) && cat >= 0
|
|
64
|
+
elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
|
|
65
|
+
if options[:rid]
|
|
66
|
+
|
|
56
67
|
category = tertiary = RIDCategory.new( rest, secondary )
|
|
68
|
+
else
|
|
69
|
+
category = tertiary = LIWCCategory.new( rest, secondary )
|
|
70
|
+
end
|
|
57
71
|
else
|
|
58
72
|
word = rest.downcase.gsub( /\s*\(1\)$/, '' )
|
|
59
73
|
@tree.insert( word, category )
|
data/lib/lemmatizer.rb
CHANGED
data/lib/liwc_analyzer.rb
CHANGED
|
@@ -1,8 +1,94 @@
|
|
|
1
1
|
module NLP
|
|
2
|
-
class
|
|
2
|
+
class LIWCAnalyzer < Analyzer
|
|
3
|
+
|
|
4
|
+
def initialize( category_file, restore = true )
|
|
5
|
+
state_file = File.expand_path(Analyzer::CACHE_DIR+'.liwc')
|
|
6
|
+
if restore
|
|
7
|
+
@dictionary = Dictionary.restore(state_file)
|
|
8
|
+
else
|
|
9
|
+
@dictionary = Dictionary.new
|
|
10
|
+
@dictionary.load_categories( category_file, :rid => false )
|
|
11
|
+
@dictionary.store(state_file)
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
|
|
3
17
|
|
|
4
|
-
def analyze
|
|
18
|
+
def analyze(scanner)
|
|
19
|
+
|
|
20
|
+
results = {
|
|
21
|
+
:word_count => 0,
|
|
22
|
+
:word_total => 0,
|
|
23
|
+
:scores => Hash.new { 0 },
|
|
24
|
+
:words => [],
|
|
25
|
+
:cwords => Hash.new { nil },
|
|
26
|
+
:long_words => [],
|
|
27
|
+
:zaimki => [],
|
|
28
|
+
:zaimki1 => [],
|
|
29
|
+
:zaimki2 => [],
|
|
30
|
+
:zaimki3 => [],
|
|
31
|
+
:przyimki => [],
|
|
32
|
+
:numbers => [],
|
|
33
|
+
:emotion => [],
|
|
34
|
+
:social => [],
|
|
35
|
+
:personal => [],
|
|
36
|
+
:posemotion => [],
|
|
37
|
+
:negemotion => [],
|
|
38
|
+
:wulgar => [],
|
|
39
|
+
:cognitive => []
|
|
40
|
+
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
while token = scanner.current
|
|
44
|
+
word = token.lemat
|
|
45
|
+
|
|
46
|
+
categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
|
|
47
|
+
unless categories.nil?
|
|
48
|
+
categories.each do |category|
|
|
49
|
+
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
|
50
|
+
token.category = category
|
|
51
|
+
results[:scores][category] = results[:scores][category] + 1
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
if results[:cwords][category.name].nil?
|
|
55
|
+
results[:cwords][category.name] = []
|
|
56
|
+
end
|
|
57
|
+
results[:cwords][category.name].push token.orth
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
results[:emotion].push token.orth if token.emotion?
|
|
61
|
+
results[:social].push token.orth if token.social?
|
|
62
|
+
results[:personal].push token.orth if token.personal?
|
|
63
|
+
results[:wulgar].push token.orth if token.bad_word?
|
|
64
|
+
results[:cognitive].push token.orth if token.cognitive?
|
|
65
|
+
|
|
66
|
+
results[:posemotion].push token.orth if token.positive_emotion?
|
|
67
|
+
results[:negemotion].push token.orth if token.negative_emotion?
|
|
68
|
+
results[:word_count] += 1
|
|
69
|
+
results[:words].push word
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
#words longer than 9
|
|
74
|
+
results[:long_words].push word if word.jlength > 9
|
|
75
|
+
if token.zaimek?
|
|
76
|
+
results[:zaimki].push word
|
|
77
|
+
|
|
78
|
+
results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
|
|
79
|
+
results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
|
|
80
|
+
results[:zaimki3].push token.orth if word === 'on'
|
|
81
|
+
end
|
|
5
82
|
|
|
83
|
+
results[:przyimki].push word if token.przyimek?
|
|
84
|
+
results[:numbers].push token.orth if token.number? or token.liczebnik?
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
results[:word_total] += 1
|
|
88
|
+
scanner.next(:alphanum)
|
|
89
|
+
end
|
|
90
|
+
results
|
|
91
|
+
|
|
6
92
|
end
|
|
7
93
|
|
|
8
94
|
|
data/lib/liwc_category.rb
CHANGED
|
@@ -1,7 +1,62 @@
|
|
|
1
1
|
module NLP
|
|
2
2
|
|
|
3
3
|
class LIWCCategory < Category
|
|
4
|
-
|
|
4
|
+
|
|
5
|
+
#primary categories
|
|
6
|
+
|
|
7
|
+
def linguistic?
|
|
8
|
+
root == :PIERWOTNE
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def psychological?
|
|
12
|
+
root == :PROCESY_PSYCHOLOGICZNE
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def relative?
|
|
17
|
+
root === :RELATYWNOSC
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def personal?
|
|
21
|
+
root == :OSOBISTE
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
#second categories
|
|
25
|
+
|
|
26
|
+
def emotion?
|
|
27
|
+
path.include? 'EMOCJE'
|
|
28
|
+
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def positive_emotion?
|
|
32
|
+
path.include? 'POZYTYWNE_EMOCJE'
|
|
33
|
+
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def negative_emotion?
|
|
37
|
+
path.include? 'NEGATYWNE_EMOCJE'
|
|
38
|
+
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def cognitive?
|
|
42
|
+
path.include? 'KOGNITYWNE_PROCESY'
|
|
43
|
+
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def sense?
|
|
47
|
+
path.include? 'ZMYSLY'
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def social?
|
|
51
|
+
path.include? 'SOCIAL'
|
|
52
|
+
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def bad_word?
|
|
56
|
+
path.include? 'WULGAR'
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
|
|
5
60
|
|
|
6
61
|
end
|
|
7
62
|
end
|
data/lib/meaningable.rb
CHANGED
|
@@ -1,44 +1,65 @@
|
|
|
1
1
|
module Meaningable
|
|
2
2
|
|
|
3
|
+
|
|
4
|
+
|
|
3
5
|
#LIWC
|
|
4
|
-
|
|
6
|
+
#primary categories
|
|
7
|
+
|
|
8
|
+
def linguistic?
|
|
9
|
+
category.root == :PIERWOTNE
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def psychological?
|
|
13
|
+
category.root == :PROCESY_PSYCHOLOGICZNE
|
|
14
|
+
end
|
|
5
15
|
|
|
6
|
-
|
|
16
|
+
|
|
17
|
+
def relative?
|
|
18
|
+
category.root === :RELATYWNOSC
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def personal?
|
|
22
|
+
category.root == :OSOBISTE
|
|
23
|
+
end
|
|
7
24
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
25
|
+
#second categories
|
|
26
|
+
|
|
27
|
+
def emotion?
|
|
28
|
+
category.path.include? 'EMOCJE'
|
|
11
29
|
|
|
12
|
-
|
|
30
|
+
end
|
|
13
31
|
|
|
14
|
-
|
|
32
|
+
def positive_emotion?
|
|
33
|
+
category.path.include? 'POZYTYWNE_EMOCJE'
|
|
34
|
+
|
|
35
|
+
end
|
|
15
36
|
|
|
16
|
-
|
|
17
|
-
|
|
37
|
+
def negative_emotion?
|
|
38
|
+
category.path.include? 'NEGATYWNE_EMOCJE'
|
|
18
39
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
end
|
|
22
|
-
|
|
23
|
-
#EXPERIMENTAl
|
|
40
|
+
end
|
|
24
41
|
|
|
25
|
-
|
|
42
|
+
def cognitive?
|
|
43
|
+
category.path.include? 'KOGNITYWNE_PROCESY'
|
|
26
44
|
|
|
27
|
-
|
|
45
|
+
end
|
|
28
46
|
|
|
47
|
+
def sense?
|
|
48
|
+
category.path.include? 'ZMYSLY'
|
|
49
|
+
end
|
|
29
50
|
|
|
30
|
-
|
|
51
|
+
def social?
|
|
52
|
+
category.path.include? 'SOCIAL'
|
|
31
53
|
|
|
32
|
-
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def bad_word?
|
|
57
|
+
category.path.include? 'WULGAR'
|
|
58
|
+
end
|
|
33
59
|
|
|
34
60
|
|
|
35
|
-
def filler?
|
|
36
61
|
|
|
37
|
-
end
|
|
38
62
|
|
|
39
|
-
def nonfluent?
|
|
40
|
-
|
|
41
|
-
end
|
|
42
63
|
|
|
43
64
|
#SEMANTIC
|
|
44
65
|
def synonym?(other)
|
data/lib/rid_analyzer.rb
CHANGED
|
@@ -1,12 +1,27 @@
|
|
|
1
1
|
module NLP
|
|
2
2
|
class RIDAnalyzer < NLP::Analyzer
|
|
3
3
|
|
|
4
|
+
|
|
5
|
+
def initialize( category_file, restore = true )
|
|
6
|
+
state_file = File.expand_path(Analyzer::CACHE_DIR+'.rid')
|
|
7
|
+
if restore
|
|
8
|
+
@dictionary = Dictionary.restore(state_file)
|
|
9
|
+
else
|
|
10
|
+
@dictionary = Dictionary.new
|
|
11
|
+
@dictionary.load_categories( category_file, :rid => true )
|
|
12
|
+
@dictionary.store(state_file)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
|
|
4
18
|
def analyze(scanner)
|
|
5
19
|
results = {
|
|
6
20
|
:word_count => 0,
|
|
7
21
|
:word_total => 0,
|
|
8
22
|
:scores => Hash.new { 0 },
|
|
9
|
-
:words => []
|
|
23
|
+
:words => [],
|
|
24
|
+
:cwords => Hash.new { nil }
|
|
10
25
|
}
|
|
11
26
|
|
|
12
27
|
while token = scanner.current
|
|
@@ -15,8 +30,13 @@ module NLP
|
|
|
15
30
|
categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
|
|
16
31
|
unless categories.nil?
|
|
17
32
|
categories.each do |category|
|
|
18
|
-
puts "Znalazłem słowo #{word} : #{category}"
|
|
33
|
+
puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
|
|
19
34
|
results[:scores][category] = results[:scores][category] + 1
|
|
35
|
+
category = category.name
|
|
36
|
+
if results[:cwords][category].nil?
|
|
37
|
+
results[:cwords][category] = []
|
|
38
|
+
end
|
|
39
|
+
results[:cwords][category].push word
|
|
20
40
|
results[:word_count] += 1
|
|
21
41
|
results[:words].push word
|
|
22
42
|
end
|
|
@@ -46,6 +66,8 @@ module NLP
|
|
|
46
66
|
|
|
47
67
|
|
|
48
68
|
|
|
69
|
+
|
|
70
|
+
|
|
49
71
|
end
|
|
50
72
|
|
|
51
73
|
|
data/lib/sentence.rb
CHANGED
data/lib/text.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module NLP
|
|
2
|
+
class Text
|
|
3
|
+
attr_reader :sentences
|
|
4
|
+
|
|
5
|
+
def initialize
|
|
6
|
+
@sentences = []
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
def << sentence
|
|
10
|
+
@sentences.push sentence
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def words_per_sentence
|
|
15
|
+
mean(@sentences.collect{|s| s.words_number})
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
private
|
|
19
|
+
def mean(x)
|
|
20
|
+
sum=0
|
|
21
|
+
x.each{|v| sum+=v }
|
|
22
|
+
sum/x.size
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|