nlp 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,85 +0,0 @@
-
-require 'stree'
-require 'category'
-require 'rid_category'
-require 'liwc_category'
-
-module NLP
-
-  class Dictionary
-    attr_accessor :tree
-
-
-    def initialize(category_file=:rid,restore = true)
-      state_file = File.expand_path(DICTIONARY_CACHE_DIR+".#{category_file.to_s}")
-      if restore and File.exist?(state_file)
-        d = Dictionary.restore(state_file)
-        @tree = d.tree
-      else
-        @tree = SearchTree.new
-        load_categories(File.dirname(__FILE__)+"/../dict/#{category_file.to_s}", category_file )
-        store(state_file)
-      end
-
-    end
-
-    def store( state_file )
-      File.open( File.expand_path( state_file ), "w" ) do |file|
-        Marshal.dump( self, file )
-      end
-      self
-    end
-
-    def self.restore( state_file )
-      File.open( File.expand_path( state_file ) ) do |file|
-        Marshal.restore( file )
-      end
-    end
-
-    def find(word)
-      if @exception_pattern && @exception_pattern =~ word
-        nil
-      else
-        @tree.find(word)
-      end
-    end
-
-
-    def load_categories(category_file,type)
-      category = nil
-      primary = nil
-      secondary = nil
-      tertiary = nil
-
-      if type == :rid
-        cat_class = NLP.const_get("RIDCategory")
-      else
-        cat_class = NLP.const_get("LIWCCategory")
-      end
-
-      File.open( category_file ) do |file|
-        while line = file.gets
-          line.chomp!
-          begin
-            lead, rest = line.scan( /(\t*)(.*)/ ).first
-            if lead.size == 0
-              category = primary = cat_class.new(rest)
-              secondary, tertiary = nil
-            elsif lead.size == 1
-              category = secondary = cat_class.new(rest, primary )
-              tertiary = nil
-            elsif lead.size == 2 && ( cat = line.strip.index(/^[A-ZĄŚĘĆŃŹŻŁÓ_]+$/)) && cat >= 0
-              category = tertiary = cat_class.new( rest, secondary )
-            else
-              word = rest.downcase.gsub( /\s*\(1\)$/, '' )
-              @tree.insert( word, category )
-            end
-          rescue
-            raise
-          end
-        end
-      end
-    end
-  end
-end
-
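For orientation, a minimal usage sketch of the removed NLP::Dictionary class, based only on the code above; it assumes `require 'nlp'` is the gem's entry point, that the gem's dict/rid data file and the DICTIONARY_CACHE_DIR constant are in place, and the looked-up word is purely illustrative:

    require 'nlp'

    # Build the RID dictionary (or restore it from the cached Marshal dump)
    # and look up a word. find returns the categories stored in the search
    # tree, or nil if the word is unknown or matches the exception pattern.
    dict = NLP::Dictionary.new(:rid)
    categories = dict.find("dream")
    puts categories.inspect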
@@ -1,14 +0,0 @@
-require 'meaningable'
-
-module NLP
-  class Emoticon < Token
-    include Meaningable
-
-    def initialize(tokens,tags)
-      @orth = tokens.join("")
-      @tags = 'emoticon'
-    end
-
-  end
-end
-
@@ -1,60 +0,0 @@
-module Inflectable
-
-  GRAM_CAT = {
-    # parts of speech
-    :adj => 'przymiotnik',
-    [:subst,:depr] => 'rzeczownik',
-    :adv => 'przyslowek',
-    :num => 'liczebnik',
-    [:pron,:siebie] => 'zaimek',
-    :prep => 'przyimek',
-    # grammatical number
-    :sg => 'liczba_pojedyncza',
-    :pl => 'liczba_mnoga',
-
-    # cases
-    :nom => 'mianownik',
-    :gen => 'dopelniacz',
-    :dat => 'celownik',
-    :acc => 'biernik',
-    :inst => 'narzednik',
-    :loc => 'miejscownik',
-    :voc => 'wolacz',
-
-    # genders
-    :m1 => 'meski_osobowy',
-    :m2 => 'meski_zwierzecy',
-    :m3 => 'meski_rzeczowy',
-    :f => 'zenski',
-    :n1 => 'nijaki_zbiorowy',
-    :n2 => 'nijaki_zwykly',
-    :p1 => 'przymnogi_osobowy',
-    :p2 => 'przymnogi_zwykly',
-    :p3 => 'przymnogi_opisowy',
-
-    # persons
-    :pri => "pierwsza_osoba",
-    :sec => "druga_osoba",
-    :ter => "trzecia_osoba",
-
-    # degree
-    :pos => "stopien_rowny",
-    :comp => "stopien_wyzszy",
-    :sup => "stopien_najwyzszy"
-  }
-
-  GRAM_CAT.each do |key,value|
-
-    define_method(value+"?"){
-      inflection.split(":").any?{|e|
-        if key.is_a? Array
-          key.any?{|k| e.include? k.to_s}
-        else
-          e.include? key.to_s
-        end
-      }
-    }
-  end
-
-
-end
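A sketch of how the removed Inflectable mixin appears to be consumed; the Word class and the colon-separated tag string below are illustrative assumptions that match the tag symbols in GRAM_CAT:

    class Word
      include Inflectable
      attr_reader :inflection          # e.g. "subst:sg:nom:m1" from a tagger
      def initialize(inflection)
        @inflection = inflection
      end
    end

    w = Word.new("subst:sg:nom:m1")
    w.rzeczownik?          #=> true, method generated from [:subst, :depr]
    w.liczba_pojedyncza?   #=> true, method generated from :sg
    w.mianownik?           #=> true, method generated from :nom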
@@ -1,112 +0,0 @@
-require 'takipi_web_service'
-require 'rexml/document'
-require 'morfeusz'
-
-module NLP
-  class Lemmatizer
-
-    include REXML
-
-    def self.lemmatize(text,method=nil,input_type=nil)
-      if text.is_a? File
-        str = text.read
-        text.close
-      elsif text.is_a? String
-        str = text
-      else
-        raise ArgumentError, "Argument is not String or File"
-      end
-
-      if method === :takipi
-        takipi_lemmatize(str,input_type)
-
-      # The default lemmatization method is Morfeusz
-      else
-        morfeusz_lemmatize(str)
-      end
-    end
-
-
-
-    def self.takipi_lemmatize(text,method)
-
-      if method === :local
-
-        xml_file = TAKIPI_XML_FILE
-
-        t1 = Thread.new do
-          `echo '#{text}' > /tmp/text.txt; takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
-        end
-
-        t1.join
-
-        f = File.open(xml_file,"r")
-        doc = Document.new f
-
-      elsif method === :remote
-        xml = TakipiWebService.request(text)
-        doc = Document.new xml
-      else
-        raise ArgumentError, 'Argument is not :local or :remote'
-      end
-
-      parse_lemmatized_xml(doc)
-    end
-
-
-    def self.morfeusz_lemmatize(text)
-      temp_text = Text.new
-
-      # simple tagger
-      # TODO: the lemmatizer should take a TokenScanner object that defines
-      # how to split the string
-      text.split(/\.|!|\?/).each do |s|
-        sentence = Sentence.new
-        sentence << s.split(" ").collect{ |t|
-          if word = Morfeusz::Lexeme.find(t)
-            if word[0]
-              Word.new(t,word[0].base_form,"")
-            else
-              Word.new(t,"","")
-            end
-          else
-            Word.new(t,"","")
-          end
-        }
-        temp_text << sentence
-      end
-      temp_text
-    end
-
-
-    def self.parse_lemmatized_xml(doc)
-
-      text = Text.new
-
-      doc.elements.each("*/chunkList/chunk") do |chunk|
-        sentence = Sentence.new
-        tokens = []
-
-        chunk.elements.each("tok") do |tok|
-          word = tok.elements[1].text
-          lemat, inflect = ""
-
-          tok.elements.each("lex") do |lex|
-            if lex.has_attributes?
-              lemat = lex.elements[1].text
-              inflect = lex.elements[2].text
-            end
-          end
-
-          tokens << Word.new(word,lemat,inflect)
-        end
-
-        sentence << tokens
-        text << sentence
-      end
-      text
-    end
-
-
-  end
-end
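A sketch of how the removed Lemmatizer appears to be invoked; the Morfeusz bindings and the TAKIPI tagger are external dependencies, so these calls only illustrate the signatures shown above and the sample sentences are placeholders:

    # Default path: Morfeusz-based lemmatization of a plain string (or an open File).
    text = NLP::Lemmatizer.lemmatize("Ala ma kota. Ona lubi koty.")

    # TAKIPI path: either a locally installed tagger or the remote web service.
    text = NLP::Lemmatizer.lemmatize("Ala ma kota.", :takipi, :remote)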
@@ -1,74 +0,0 @@
-module NLP
-  class LIWCAnalyzer < Analyzer
-
-    def initialize(dicts)
-      @dictionary = Dictionary.new(:liwc)
-    end
-
-
-    def analyze(scanner)
-
-      results = Statistic.new
-      results.hash = {
-        :long_words => [],
-        :zaimki => [],
-        :zaimki1 => [],
-        :zaimki2 => [],
-        :zaimki3 => [],
-        :przyimki => [],
-        :numbers => [],
-        :emotion => [],
-        :social => [],
-        :personal => [],
-        :posemotion => [],
-        :negemotion => [],
-        :wulgar => [],
-        :cognitive => []
-      }
-
-      while token = scanner.current
-        word = token.lemat
-
-        categories = @dictionary.find(word.gsub( /[^\w-]/, "" ))
-        unless categories.nil?
-          categories.each do |category|
-            puts "Znalazłem słowo #{word} : #{category} root: #{category.root}"
-            token.category = category
-            results.add(word,category)
-
-
-            results[:emotion].push token.orth if token.emotion?
-            results[:social].push token.orth if token.social?
-            results[:personal].push token.orth if token.personal?
-            results[:wulgar].push token.orth if token.bad_word?
-            results[:cognitive].push token.orth if token.cognitive?
-
-            results[:posemotion].push token.orth if token.positive_emotion?
-            results[:negemotion].push token.orth if token.negative_emotion?
-          end
-        end
-
-        # words longer than 10 characters
-        results[:long_words].push word if word.jlength > 10
-        if token.zaimek?
-          results[:zaimki].push word
-
-          results[:zaimki1].push token.orth if word === 'ja' or word === 'my'
-          results[:zaimki2].push token.orth if word === 'ty' or word === 'wy'
-          results[:zaimki3].push token.orth if word === 'on'
-        end
-
-        results[:przyimki].push word if token.przyimek?
-        results[:numbers].push token.orth if token.number? or token.liczebnik?
-
-
-        results.total_words += 1
-        scanner.next(:alphanum)
-      end
-      results
-
-    end
-
-  end
-
-end
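A rough sketch of driving the removed LIWCAnalyzer; TokenScanner is only mentioned in a TODO above, so its name and the shape of the lemmatized text are assumptions made for illustration:

    text = NLP::Lemmatizer.lemmatize("Ala ma kota. Ona lubi koty.")
    scanner = NLP::TokenScanner.new(text)   # hypothetical scanner over the Word tokens
    analyzer = NLP::LIWCAnalyzer.new(nil)   # the argument is unused; it always loads Dictionary.new(:liwc)
    stats = analyzer.analyze(scanner)

    puts stats[:posemotion].size            # words tagged with positive emotion
    puts stats.total_words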
@@ -1,61 +0,0 @@
-module NLP
-
-  class LIWCCategory < Category
-
-    # primary categories
-
-    def linguistic?
-      root == :PIERWOTNE
-    end
-
-    def psychological?
-      root == :PROCESY_PSYCHOLOGICZNE
-    end
-
-
-    def relative?
-      root === :RELATYWNOSC
-    end
-
-    def personal?
-      root == :OSOBISTE
-    end
-
-    # secondary categories
-
-    def emotion?
-      path.include? 'EMOCJE'
-
-    end
-
-    def positive_emotion?
-      path.include? 'POZYTYWNE_EMOCJE'
-
-    end
-
-    def negative_emotion?
-      path.include? 'NEGATYWNE_EMOCJE'
-
-    end
-
-    def cognitive?
-      path.include? 'KOGNITYWNE_PROCESY'
-
-    end
-
-    def sense?
-      path.include? 'ZMYSLY'
-    end
-
-    def social?
-      path.include? 'SOCIAL'
-
-    end
-
-    def bad_word?
-      path.include? 'WULGAR'
-    end
-
-
-  end
-end
@@ -1,69 +0,0 @@
-module Meaningable
-
-  # LIWC
-  # primary categories
-
-  def linguistic?
-    category.root == :PIERWOTNE
-  end
-
-  def psychological?
-    category.root == :PROCESY_PSYCHOLOGICZNE
-  end
-
-
-  def relative?
-    category.root === :RELATYWNOSC
-  end
-
-  def personal?
-    category.root == :OSOBISTE
-  end
-
-  # secondary categories
-
-  def emotion?
-    category.path.include? 'EMOCJE'
-
-  end
-
-  def positive_emotion?
-    category.path.include? 'POZYTYWNE_EMOCJE'
-
-  end
-
-  def negative_emotion?
-    category.path.include? 'NEGATYWNE_EMOCJE'
-
-  end
-
-  def cognitive?
-    category.path.include? 'KOGNITYWNE_PROCESY'
-
-  end
-
-  def sense?
-    category.path.include? 'ZMYSLY'
-  end
-
-  def social?
-    category.path.include? 'SOCIAL'
-
-  end
-
-  def bad_word?
-    category.path.include? 'WULGAR'
-  end
-
-
-  # SEMANTIC
-  def synonym?(other)
-
-  end
-
-  def synonyms
-
-  end
-
-
-end
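A sketch of the intent behind the removed Meaningable mixin: a token that has been assigned a Category (as LIWCAnalyzer does above) answers the same LIWC predicates by delegating to that category. The token setup and the LIWC dictionary instance are assumptions for illustration:

    dictionary = NLP::Dictionary.new(:liwc)
    token = NLP::Word.new("kocham", "kochać", "fin:sg:pri")   # hypothetical Word token (orth, lemma, tag)
    token.category = dictionary.find(token.lemat).first       # a LIWCCategory looked up by lemma

    token.emotion?            # true if the category path contains 'EMOCJE'
    token.positive_emotion?   # true if it contains 'POZYTYWNE_EMOCJE'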