morfologik 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
data/lib/morfologik.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require 'open3'
|
5
|
+
require 'awesome_print'
|
6
|
+
|
7
|
+
require 'morfologik/output_parser'
|
8
|
+
|
9
|
+
# Ruby wrapper around the Morfologik command-line stemmer (a Java .jar).
#
# Each analysis call pipes the words to `java -jar <jar> plstem` and hands the
# lines printed on standard output to an OutputParser.
class Morfologik

  # @return [String] path of the Morfologik .jar file in use
  attr_reader :jar

  # @param [Hash] options
  # @option options [String] :jar path to the Morfologik .jar file
  #   (defaults to the jar shipped next to this file)
  # @option options [String] :input_encoding value passed to `-ie` ('UTF-8' by default)
  # @option options [String] :output_encoding value passed to `-oe` ('UTF-8' by default)
  # @raise [LoadError] if the jar does not exist or does not end in ".jar"
  def initialize(options={})
    @jar = options[:jar] || default_jar
    raise LoadError, "Morfologik .jar file not found" unless jar_file_exists?

    @output_parser = OutputParser.new
    @ie = options[:input_encoding] || 'UTF-8'
    @oe = options[:output_encoding] || 'UTF-8'
  end

  # Stems words giving their stems, categories and tags.
  #
  # @param [String, Array<String>] words words to stem; a String is split on
  #   whitespace, anything else is used as the list of words as-is
  # @return [Hash] analysis for each recognized word
  # @example
  #   Morfologik.new.stem("ma")
  #   # => {
  #     "ma" => [
  #       {
  #         :stem => "mieć",
  #         :category => "verb",
  #         :values => [
  #           { "tense" => "fin", "number" => "sg", "person" => "ter", "aspect" => "imperf" }
  #         ]
  #       },
  #       {
  #         :stem => "mój",
  #         :category => "adj",
  #         :values => [
  #           { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
  #           { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" }
  #         ]
  #       }
  #     ]
  #   }
  def stem(words)
    @output_parser.parse(run_jar(word_list(words)))
  end

  alias_method :lemmatize, :stem

  # Stems words giving only their stems.
  #
  # @param (see #stem)
  # @return [Hash] stems for each recognized word
  # @example
  #   Morfologik.new.stem_simple("ma")
  #   # => { "ma" => [ "mieć", "mój" ] }
  def stem_simple(words)
    @output_parser.parse_stems_only(run_jar(word_list(words)))
  end

  alias_method :lemmatize_simple, :stem_simple

  # Gives only the categories reported for each word.
  #
  # @param (see #stem)
  # @return [Hash] categories for each recognized word
  def categories(words)
    @output_parser.parse_categories_only(run_jar(word_list(words)))
  end

  # Checks if given words have at least one common stem. Returns nil if the
  # analysis fails, i.e. fewer than two distinct words were given or not every
  # distinct word was recognized.
  #
  # @param [Array<String>] words words to check
  # @return [true, false, nil]
  def equal_stems?(*words)
    distinct = words.uniq
    return nil if distinct.size < 2

    stems = @output_parser.parse_stems_only(run_jar(distinct))
    return nil unless distinct.size == stems.keys.size

    # Intersect the stem lists of all words; anything left is a shared stem.
    !stems.values.inject(:&).empty?
  end

  private

  # Turns the public `words` argument into a list: Strings are split on
  # whitespace, any other value is returned untouched.
  def word_list(words)
    words.kind_of?(String) ? words.split : words
  end

  # Runs the stemmer on the distinct words and returns its standard output as
  # an Array of lines, without the lines starting with "Processed".
  #
  # The words are written to the process' standard input and the command is
  # passed as an argument list, so no shell is involved: quotes or shell
  # metacharacters in a word and spaces in the jar path cannot alter the
  # command. Standard error is drained and discarded.
  def run_jar(words)
    return [] if words.empty? # nothing to analyse, do not start a JVM

    stdout, _stderr, _status = Open3.capture3(
      'java', '-jar', @jar.to_s, 'plstem', '-ie', @ie.to_s, '-oe', @oe.to_s,
      :stdin_data => "#{words.uniq.join(' ')}\n"
    )

    stdout.each_line.reject { |line| line.start_with?('Processed') }
  rescue Errno::ENOENT
    # No `java` executable on PATH. The previous shell pipeline produced no
    # output in that case, so callers keep getting an empty analysis.
    []
  end

  # True when @jar points to an existing file whose extension is ".jar".
  def jar_file_exists?
    File.exist?(@jar) && File.extname(@jar) == '.jar'
  end

  # Absolute path of the jar shipped with this library. A missing file does
  # not raise here, so that #initialize can report it with its own LoadError.
  def default_jar
    path = File.join(File.dirname(__FILE__), 'morfologik', 'jar',
                     'morfologik-tools-1.5.2-standalone.jar')
    File.exist?(path) ? File.realpath(path) : File.expand_path(path)
  end

end
|
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'morfologik/tagset_parser'
|
2
|
+
|
3
|
+
class Morfologik
  # Turns the lines printed by the stemmer into Hashes keyed by word.
  #
  # Every line is split on whitespace into word, stem and tag description.
  # A stem of "-" marks a word without analysis; such lines, blank lines and
  # lines without a stem are ignored. The tag description may hold several
  # alternatives joined with "+".
  class OutputParser

    def initialize
      @tagset_parser = TagsetParser.new
    end

    # Full analysis: one entry per tag alternative of every line.
    #
    # @param [Enumerable<String>] output lines printed by the stemmer
    # @return [Hash{String => Array<Hash>}] for each word, hashes with the
    #   keys :stem, :category and :values (as returned by TagsetParser#parse)
    def parse(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem)

        tag_alternatives(desc).each do |tags|
          category, values = @tagset_parser.parse(tags)
          (result[word] ||= []) << { :stem => stem, :category => category, :values => values }
        end
      end
    end

    # Stems only, without duplicates, in order of first appearance.
    #
    # @param [Enumerable<String>] output lines printed by the stemmer
    # @return [Hash{String => Array<String>}] stems for each word
    def parse_stems_only(output)
      output.each_with_object({}) do |line, result|
        word, stem = line.split
        next unless stem_found?(stem)

        stems = (result[word] ||= [])
        stems << stem unless stems.include?(stem)
      end
    end

    # Categories only, without duplicates, in order of first appearance.
    # The category is the part before the first ":" of each tag alternative,
    # so the result agrees with the :category values produced by #parse.
    #
    # @param [Enumerable<String>] output lines printed by the stemmer
    # @return [Hash{String => Array<String>}] categories for each word
    def parse_categories_only(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem)

        tag_alternatives(desc).each do |tags|
          category = tags.split(':').first
          next if category.nil?

          categories = (result[word] ||= [])
          categories << category unless categories.include?(category)
        end
      end
    end

    private

    # False for the "-" placeholder and for a missing stem (blank/short line).
    def stem_found?(stem)
      !(stem.nil? || stem == '-')
    end

    # Splits a tag description on "+"; a missing description gives no alternatives.
    def tag_alternatives(desc)
      desc.to_s.split('+').reject(&:empty?)
    end

  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
class Morfologik
  # Parses one tag string such as "adj:sg:nom.voc:f:pos".
  #
  # Tags are separated by ":"; the first one is taken as the category. Each of
  # the remaining positions may list several alternatives separated by ".".
  class TagsetParser

    # Maps every known tag to the name of the attribute it is a value of.
    TAGS = {
      "adj"     => "pos",
      "adja"    => "pos",
      "adjp"    => "pos",
      "adv"     => "pos",
      "num"     => "pos",
      "ppron12" => "pos",
      "ppron3"  => "pos",
      "pred"    => "pos",
      "prep"    => "pos",
      "siebie"  => "pos",
      "subst"   => "pos",
      "verb"    => "pos",
      "conj"    => "pos",
      "qub"     => "pos",
      "burk"    => "pos", # bound word
      "interj"  => "pos", # interjection
      "interp"  => "pos", # punctuation
      "xxx"     => "pos", # foreign word
      "brev"    => "pos", # abbreviation
      "nie"     => "pos",
      "ign"     => "pos",
      "sg"      => "number",
      "pl"      => "number",
      "pltant"  => "number",
      "nom"     => "case",
      "gen"     => "case",
      "acc"     => "case",
      "dat"     => "case",
      "inst"    => "case",
      "loc"     => "case",
      "voc"     => "case",
      "pos"     => "degree",
      "comp"    => "degree",
      "sup"     => "degree",
      "m"       => "gender",
      "m1"      => "gender",
      "m2"      => "gender",
      "m3"      => "gender",
      "m4"      => "gender",
      "n"       => "gender",
      "f"       => "gender",
      "n1"      => "gender",
      "n2"      => "gender",
      "p1"      => "gender",
      "p2"      => "gender",
      "p3"      => "gender",
      "pri"     => "person",
      "sec"     => "person",
      "ter"     => "person",
      "depr"    => "depreciativity",
      "winien"  => "winien",
      "aff"     => "negation",
      "neg"     => "negation",
      "perf"    => "aspect",
      "imperf"  => "aspect",
      "?perf"   => "aspect",
      "nakc"    => "accentability",
      "akc"     => "accentability",
      "praep"   => "post-prepositionality",
      "npraep"  => "post-prepositionality",
      "ger"     => "tense",
      "imps"    => "tense",
      "inf"     => "tense",
      "fin"     => "tense",
      "bedzie"  => "tense",
      "praet"   => "tense",
      "refl"    => "tense",
      "pact"    => "tense",
      "pant"    => "tense",
      "pcon"    => "tense",
      "ppas"    => "tense",
      "impt"    => "mode",
      "pot"     => "mode",
      "indecl"  => "uninflected",
      "irreg"   => "irregularity",
      "pun"     => "fullstoppedness",
      "npun"    => "fullstoppedness",
      "wok"     => "vocalicity",
      "nwok"    => "vocalicity",
      "agl"     => "agglutination",
      "nagl"    => "agglutination",
      "_"       => "unknown",
      "congr"   => "unknown",
      "rec"     => "unknown"
    }

    # @param [String] raw_tags tag string, e.g. "adj:sg:nom.voc:f:pos"
    # @return [Array(String, Array<Hash>)] the category followed by one Hash
    #   (attribute name => tag) per combination of the alternatives
    # @example
    #   parse("adj:sg:nom.voc:f:pos")
    #   # => ["adj", [
    #   #      { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
    #   #      { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" }
    #   #    ]]
    def parse(raw_tags)
      tags = raw_tags.split(':')

      category = find_part_of_speech(tags)
      values = split_tags(tags)

      return category, values
    end

    private

    # Removes the first tag from the list and returns it as the category.
    def find_part_of_speech(tags)
      tags.shift
    end

    # Expands the "."-separated alternatives of every tag position into all
    # their combinations (cartesian product, first position varying slowest)
    # and turns each combination into a Hash keyed by attribute name.
    #
    # A tag missing from TAGS is stored under the nil key. With no tags at all
    # the result is a single empty Hash.
    def split_tags(tags)
      alternatives = tags.map { |tag| tag.split('.') }

      combinations =
        if alternatives.empty?
          [[]]
        else
          alternatives.first.product(*alternatives.drop(1))
        end

      combinations.map do |combination|
        combination.each_with_object({}) { |tag, attributes| attributes[TAGS[tag]] = tag }
      end
    end

  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: morfologik
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- snukky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Ruby MRI bindings for morfologik-stemming library (Polish morphological
|
15
|
+
analyzer) written in Java.
|
16
|
+
email:
|
17
|
+
- snk987@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/morfologik.rb
|
23
|
+
- lib/morfologik/output_parser.rb
|
24
|
+
- lib/morfologik/tagset_parser.rb
|
25
|
+
- lib/morfologik/jar/morfologik-tools-1.5.2-standalone.jar
|
26
|
+
homepage: http://github.com/snukky/morfologik
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.10
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Ruby bindings for Morfologik.
|
50
|
+
test_files: []
|