nlp 0.2.3 → 0.2.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
data/dict/rid CHANGED
@@ -234,8 +234,7 @@ PIERWOTNE
234
234
  defakacja
235
235
  dupa
236
236
  dupek
237
- dupsko
238
- enema
237
+ dupsko
239
238
  fetor
240
239
  gazy
241
240
  gnić
@@ -1018,7 +1017,7 @@ PIERWOTNE
1018
1017
  potrząsać
1019
1018
  przejażdżka
1020
1019
  przenosić się
1021
- PRZYPADKOWE RUCHY
1020
+ PRZYPADKOWE_RUCHY
1022
1021
  puchnąć
1023
1022
  pulsować
1024
1023
  pustynia
data/lib/analyzer.rb CHANGED
@@ -2,25 +2,28 @@ require 'dictionary'
2
2
  #require 'morfeusz'
3
3
  require 'token'
4
4
  require 'word'
5
+ require 'token'
6
+ require 'text'
5
7
  require 'emoticon'
6
8
  require 'sentence'
7
9
  require "token_scanner.rb"
8
10
  require "lemmatizer"
9
-
11
+ require 'jcode'
10
12
  $KODE = "UTF8"
11
13
 
12
14
  module NLP
13
15
 
14
16
  class Analyzer
15
17
 
18
+ CACHE_DIR = '~/'
16
19
 
17
20
  def initialize( category_file, restore = true )
18
- state_file = File.expand_path(Dictionary::CACHE_DIR)
21
+ state_file = File.expand_path(Analyzer::CACHE_DIR)
19
22
  if restore
20
23
  @dictionary = Dictionary.restore(state_file)
21
24
  else
22
25
  @dictionary = Dictionary.new
23
- @dictionary.load_categories( category_file )
26
+ @dictionary.load_categories( category_file, :rid => true )
24
27
  @dictionary.store(state_file)
25
28
  end
26
29
 
@@ -33,7 +36,7 @@ module NLP
33
36
  :word_count => 0,
34
37
  :word_total => 0,
35
38
  :scores => Hash.new { 0 },
36
- :words => []
39
+ :words => []
37
40
  }
38
41
 
39
42
 
data/lib/dictionary.rb CHANGED
@@ -2,10 +2,10 @@
2
2
  require 'stree'
3
3
  require 'category'
4
4
  require 'rid_category'
5
+ require 'liwc_category'
5
6
 
6
7
  module NLP
7
8
  class Dictionary
8
- CACHE_DIR = '~/.rima'
9
9
  def initialize
10
10
  @tree = SearchTree.new
11
11
  @categories = {}
@@ -35,7 +35,7 @@ module NLP
35
35
  end
36
36
 
37
37
 
38
- def load_categories( category_file )
38
+ def load_categories( category_file,options )
39
39
  category = nil
40
40
  primary = nil
41
41
  secondary = nil
@@ -47,13 +47,27 @@ module NLP
47
47
  begin
48
48
  lead, rest = line.scan( /(\t*)(.*)/ ).first
49
49
  if lead.size == 0
50
+ if options[:rid]
50
51
  category = primary = RIDCategory.new( rest )
52
+ else
53
+ category = primary = LIWCCategory.new( rest )
54
+ end
55
+
51
56
  secondary, tertiary = nil
52
57
  elsif lead.size == 1
58
+ if options[:rid]
53
59
  category = secondary = RIDCategory.new( rest, primary )
60
+ else
61
+ category = secondary = LIWCCategory.new(rest,primary)
62
+ end
54
63
  tertiary = nil
55
- elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ]+$/)) && cat >= 0
64
+ elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
65
+ if options[:rid]
66
+
56
67
  category = tertiary = RIDCategory.new( rest, secondary )
68
+ else
69
+ category = tertiary = LIWCCategory.new( rest, secondary )
70
+ end
57
71
  else
58
72
  word = rest.downcase.gsub( /\s*\(1\)$/, '' )
59
73
  @tree.insert( word, category )
data/lib/lemmatizer.rb CHANGED
@@ -77,7 +77,7 @@ class Lemmatizer
77
77
 
78
78
  def self.parse_lematized_xml(doc)
79
79
 
80
- text = []
80
+ text = Text.new
81
81
 
82
82
  doc.elements.each("*/chunkList/chunk") do |chunk|
83
83
  sentence = Sentence.new
data/lib/liwc_analyzer.rb CHANGED
@@ -1,8 +1,94 @@
1
1
  module NLP
2
- class LIWC_Analyzer < Analyzer
2
+ class LIWCAnalyzer < Analyzer
3
+
4
+ def initialize( category_file, restore = true )
5
+ state_file = File.expand_path(Analyzer::CACHE_DIR+'.liwc')
6
+ if restore
7
+ @dictionary = Dictionary.restore(state_file)
8
+ else
9
+ @dictionary = Dictionary.new
10
+ @dictionary.load_categories( category_file, :rid => false )
11
+ @dictionary.store(state_file)
12
+ end
13
+
14
+ end
15
+
16
+
3
17
 
4
- def analyze
18
+ def analyze(scanner)
19
+
20
+ results = {
21
+ :word_count => 0,
22
+ :word_total => 0,
23
+ :scores => Hash.new { 0 },
24
+ :words => [],
25
+ :cwords => Hash.new { nil },
26
+ :long_words => [],
27
+ :zaimki => [],
28
+ :zaimki1 => [],
29
+ :zaimki2 => [],
30
+ :zaimki3 => [],
31
+ :przyimki => [],
32
+ :numbers => [],
33
+ :emotion => [],
34
+ :social => [],
35
+ :personal => [],
36
+ :posemotion => [],
37
+ :negemotion => [],
38
+ :wulgar => [],
39
+ :cognitive => []
40
+
41
+ }
42
+
43
+ while token = scanner.current
44
+ word = token.lemat
45
+
46
+ categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
47
+ unless categories.nil?
48
+ categories.each do |category|
49
+ puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
50
+ token.category = category
51
+ results[:scores][category] = results[:scores][category] + 1
52
+
53
+
54
+ if results[:cwords][category.name].nil?
55
+ results[:cwords][category.name] = []
56
+ end
57
+ results[:cwords][category.name].push token.orth
58
+
59
+
60
+ results[:emotion].push token.orth if token.emotion?
61
+ results[:social].push token.orth if token.social?
62
+ results[:personal].push token.orth if token.personal?
63
+ results[:wulgar].push token.orth if token.bad_word?
64
+ results[:cognitive].push token.orth if token.cognitive?
65
+
66
+ results[:posemotion].push token.orth if token.positive_emotion?
67
+ results[:negemotion].push token.orth if token.negative_emotion?
68
+ results[:word_count] += 1
69
+ results[:words].push word
70
+ end
71
+ end
72
+
73
+ #words longer than 9
74
+ results[:long_words].push word if word.jlength > 9
75
+ if token.zaimek?
76
+ results[:zaimki].push word
77
+
78
+ results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
79
+ results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
80
+ results[:zaimki3].push token.orth if word === 'on'
81
+ end
5
82
 
83
+ results[:przyimki].push word if token.przyimek?
84
+ results[:numbers].push token.orth if token.number? or token.liczebnik?
85
+
86
+
87
+ results[:word_total] += 1
88
+ scanner.next(:alphanum)
89
+ end
90
+ results
91
+
6
92
  end
7
93
 
8
94
 
data/lib/liwc_category.rb CHANGED
@@ -1,7 +1,62 @@
1
1
  module NLP
2
2
 
3
3
  class LIWCCategory < Category
4
-
4
+
5
+ #primary categories
6
+
7
+ def linguistic?
8
+ root == :PIERWOTNE
9
+ end
10
+
11
+ def psychological?
12
+ root == :PROCESY_PSYCHOLOGICZNE
13
+ end
14
+
15
+
16
+ def relative?
17
+ root === :RELATYWNOSC
18
+ end
19
+
20
+ def personal?
21
+ root == :OSOBISTE
22
+ end
23
+
24
+ #second categories
25
+
26
+ def emotion?
27
+ path.include? 'EMOCJE'
28
+
29
+ end
30
+
31
+ def positive_emotion?
32
+ path.include? 'POZYTYWNE_EMOCJE'
33
+
34
+ end
35
+
36
+ def negative_emotion?
37
+ path.include? 'NEGATYWNE_EMOCJE'
38
+
39
+ end
40
+
41
+ def cognitive?
42
+ path.include? 'KOGNITYWNE_PROCESY'
43
+
44
+ end
45
+
46
+ def sense?
47
+ path.include? 'ZMYSLY'
48
+ end
49
+
50
+ def social?
51
+ path.include? 'SOCIAL'
52
+
53
+ end
54
+
55
+ def bad_word?
56
+ path.include? 'WULGAR'
57
+ end
58
+
59
+
5
60
 
6
61
  end
7
62
  end
data/lib/meaningable.rb CHANGED
@@ -1,44 +1,65 @@
1
1
  module Meaningable
2
2
 
3
+
4
+
3
5
  #LIWC
4
- def positive_emotion?
6
+ #primary categories
7
+
8
+ def linguistic?
9
+ category.root == :PIERWOTNE
10
+ end
11
+
12
+ def psychological?
13
+ category.root == :PROCESY_PSYCHOLOGICZNE
14
+ end
5
15
 
6
- end
16
+
17
+ def relative?
18
+ category.root === :RELATYWNOSC
19
+ end
20
+
21
+ def personal?
22
+ category.root == :OSOBISTE
23
+ end
7
24
 
8
- def negative_emotion?
9
-
10
- end
25
+ #second categories
26
+
27
+ def emotion?
28
+ category.path.include? 'EMOCJE'
11
29
 
12
- def emotion?
30
+ end
13
31
 
14
- end
32
+ def positive_emotion?
33
+ category.path.include? 'POZYTYWNE_EMOCJE'
34
+
35
+ end
15
36
 
16
- def cognitive?
17
- end
37
+ def negative_emotion?
38
+ category.path.include? 'NEGATYWNE_EMOCJE'
18
39
 
19
- def social?
20
-
21
- end
22
-
23
- #EXPERIMENTAl
40
+ end
24
41
 
25
- def bad_word?
42
+ def cognitive?
43
+ category.path.include? 'KOGNITYWNE_PROCESY'
26
44
 
27
- end
45
+ end
28
46
 
47
+ def sense?
48
+ category.path.include? 'ZMYSLY'
49
+ end
29
50
 
30
- def emoticon?
51
+ def social?
52
+ category.path.include? 'SOCIAL'
31
53
 
32
- end
54
+ end
55
+
56
+ def bad_word?
57
+ category.path.include? 'WULGAR'
58
+ end
33
59
 
34
60
 
35
- def filler?
36
61
 
37
- end
38
62
 
39
- def nonfluent?
40
-
41
- end
42
63
 
43
64
  #SEMANTIC
44
65
  def synonym?(other)
data/lib/rid_analyzer.rb CHANGED
@@ -1,12 +1,27 @@
1
1
  module NLP
2
2
  class RIDAnalyzer < NLP::Analyzer
3
3
 
4
+
5
+ def initialize( category_file, restore = true )
6
+ state_file = File.expand_path(Analyzer::CACHE_DIR+'.rid')
7
+ if restore
8
+ @dictionary = Dictionary.restore(state_file)
9
+ else
10
+ @dictionary = Dictionary.new
11
+ @dictionary.load_categories( category_file, :rid => true )
12
+ @dictionary.store(state_file)
13
+ end
14
+
15
+ end
16
+
17
+
4
18
  def analyze(scanner)
5
19
  results = {
6
20
  :word_count => 0,
7
21
  :word_total => 0,
8
22
  :scores => Hash.new { 0 },
9
- :words => []
23
+ :words => [],
24
+ :cwords => Hash.new { nil }
10
25
  }
11
26
 
12
27
  while token = scanner.current
@@ -15,8 +30,13 @@ module NLP
15
30
  categories = @dictionary.find( word.gsub( /[^\w-]/, "" ) )
16
31
  unless categories.nil?
17
32
  categories.each do |category|
18
- puts "Znalazłem słowo #{word} : #{category}"
33
+ puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
19
34
  results[:scores][category] = results[:scores][category] + 1
35
+ category = category.name
36
+ if results[:cwords][category].nil?
37
+ results[:cwords][category] = []
38
+ end
39
+ results[:cwords][category].push word
20
40
  results[:word_count] += 1
21
41
  results[:words].push word
22
42
  end
@@ -46,6 +66,8 @@ module NLP
46
66
 
47
67
 
48
68
 
69
+
70
+
49
71
  end
50
72
 
51
73
 
data/lib/sentence.rb CHANGED
@@ -8,5 +8,9 @@ module NLP
8
8
  def << tokens
9
9
  @tokens.concat tokens
10
10
  end
11
+
12
+ def words_number
13
+ @tokens.size
14
+ end
11
15
  end
12
16
  end
data/lib/text.rb ADDED
@@ -0,0 +1,25 @@
1
+ module NLP
2
+ class Text
3
+ attr_reader :sentences
4
+
5
+ def initialize
6
+ @sentences = []
7
+ end
8
+
9
+ def << sentence
10
+ @sentences.push sentence
11
+ end
12
+
13
+
14
+ def words_per_sentence
15
+ mean(@sentences.collect{|s| s.words_number})
16
+ end
17
+
18
+ private
19
+ def mean(x)
20
+ sum=0
21
+ x.each{|v| sum+=v }
22
+ sum/x.size
23
+ end
24
+ end
25
+ end