morfologik 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/morfologik.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
require 'open3'
|
5
|
+
require 'awesome_print'
|
6
|
+
|
7
|
+
require 'morfologik/output_parser'
|
8
|
+
|
9
|
+
# Ruby wrapper around the Morfologik command-line stemmer shipped as a Java
# .jar file.
#
# Every query spawns `java -jar <jar> plstem`, feeds the words to the
# process on standard input and hands the captured output lines to an
# OutputParser.
class Morfologik

  # @return [String] path to the Morfologik .jar file used for analysis
  attr_reader :jar

  # @param [Hash] options
  # @option options [String] :jar path to the Morfologik .jar file
  #   (defaults to the jar located under morfologik/jar next to this file)
  # @option options [String] :input_encoding value passed to `-ie` (default 'UTF-8')
  # @option options [String] :output_encoding value passed to `-oe` (default 'UTF-8')
  # @raise [LoadError] if the jar does not exist or has no .jar extension
  def initialize(options={})
    @jar = options[:jar] || default_jar
    raise LoadError, "Morfologik .jar file not found" unless jar_file_exists?

    @output_parser = OutputParser.new
    @ie = options[:input_encoding] || 'UTF-8'
    @oe = options[:output_encoding] || 'UTF-8'
  end

  # Stems words giving their stems, categories and tags.
  #
  # @param [String, Array<String>] words words to stem; a String is split
  #   on whitespace
  # @return [Hash] analysis for each recognized word
  # @example
  #   Morfologik.new.stem("ma")
  #   # => {
  #     "ma" => [
  #       {
  #         :stem => "mieć",
  #         :category => "verb",
  #         :values => [
  #           { "tense" => "fin", "number" => "sg", "person" => "ter", "aspect" => "imperf" }
  #         ]
  #       },
  #       {
  #         :stem => "mój",
  #         :category => "adj",
  #         :values => [
  #           { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
  #           { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" }
  #         ]
  #       }
  #     ]
  #   }
  def stem(words)
    @output_parser.parse(run_jar(word_list(words)))
  end

  alias_method :lemmatize, :stem

  # Stems words giving only their stems.
  #
  # @param (see #stem)
  # @return [Hash] stems for each recognized word
  # @example
  #   Morfologik.new.stem_simple("ma")
  #   # => { "ma" => [ "mieć", "mój" ] }
  def stem_simple(words)
    @output_parser.parse_stems_only(run_jar(word_list(words)))
  end

  alias_method :lemmatize_simple, :stem_simple

  # Analyses words giving only their categories (the first tag of each
  # reading, as extracted by OutputParser#parse_categories_only).
  #
  # @param (see #stem)
  # @return [Hash] categories for each recognized word
  def categories(words)
    @output_parser.parse_categories_only(run_jar(word_list(words)))
  end

  # Checks if given words have at least one common stem. Returns nil if the
  # analysis fails, i.e. fewer than two distinct words were given or not
  # every distinct word was recognized.
  #
  # @param [String] *words words to check
  # @return [true, false, nil]
  def equal_stems?(*words)
    distinct = words.uniq
    return nil if distinct.size < 2

    stems = @output_parser.parse_stems_only(run_jar(distinct))
    return nil unless distinct.size == stems.keys.size

    # Intersect the stem lists of all words; any survivor is a shared stem.
    !stems.values.inject(:&).empty?
  end

  private

  # Accepts either a whitespace-separated String or an Array of words and
  # returns the list of words.
  def word_list(words)
    words.kind_of?(String) ? words.split : words
  end

  # Runs the stemmer on the distinct words and returns its output lines,
  # without the "Processed ..." summary line.
  #
  # The command is passed as an argument vector (no shell involved) and the
  # words are written to the process' standard input, so quotes or other
  # shell metacharacters in the words or in the jar path cannot alter the
  # command. capture3 also drains stderr, so the child can never block on a
  # full stderr pipe.
  #
  # @param [Array<String>] words
  # @return [Array<String>] raw output lines
  def run_jar(words)
    command = ['java', '-jar', @jar.to_s, 'plstem', '-ie', @ie.to_s, '-oe', @oe.to_s]
    input = words.uniq.join(' ') + "\n"

    stdout, _stderr, _status = Open3.capture3(*command, :stdin_data => input)
    stdout.lines.reject { |line| line.start_with?('Processed') }
  rescue Errno::ENOENT
    # No `java` executable: the former shell pipeline produced no output in
    # this case, so keep returning an empty result instead of raising.
    []
  end

  # True when @jar points to an existing file with a .jar extension.
  def jar_file_exists?
    File.exist?(@jar) && File.extname(@jar) == '.jar'
  end

  # Path of the jar expected next to this file. The path is returned even
  # when the file is missing so that #initialize can raise its LoadError;
  # when it exists, symlinks are resolved.
  def default_jar
    path = File.expand_path('../morfologik/jar/morfologik-tools-1.5.2-standalone.jar', __FILE__)
    File.exist?(path) ? File.realpath(path) : path
  end

end
|
Binary file
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'morfologik/tagset_parser'
|
2
|
+
|
3
|
+
class Morfologik
  # Turns the raw output lines of the Morfologik stemmer into Ruby hashes.
  #
  # Every line is split on whitespace into up to three fields: the analysed
  # word, its stem and a tag description. A stem of "-" marks a word for
  # which no stem was found; such lines, as well as blank or truncated
  # lines, are ignored.
  class OutputParser

    def initialize
      @tagset_parser = TagsetParser.new
    end

    # Builds the full analysis of every recognized word.
    #
    # The tag description may hold several tagsets joined with "+"; each of
    # them yields one entry for the word.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<Hash>}] for each word a list of
    #   { :stem, :category, :values } hashes
    def parse(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem) && desc

        desc.split('+').each do |tags|
          category, values = @tagset_parser.parse(tags)
          (result[word] ||= []) << { :stem => stem, :category => category, :values => values }
        end
      end
    end

    # Collects only the distinct stems of every recognized word.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<String>}]
    def parse_stems_only(output)
      output.each_with_object({}) do |line, result|
        word, stem = line.split
        next unless stem_found?(stem)

        # Array#| appends while keeping entries unique and in order.
        result[word] = (result[word] || []) | [stem]
      end
    end

    # Collects only the distinct categories (first tag of each tagset) of
    # every recognized word. Like #parse, all "+"-joined tagsets of a line
    # are taken into account.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<String>}]
    def parse_categories_only(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem) && desc

        categories = desc.split('+').map { |tags| tags.split(':').first }.compact
        next if categories.empty?

        result[word] = (result[word] || []) | categories
      end
    end

    private

    # A stem is missing on blank/short lines (nil) and is reported as "-"
    # for words the stemmer could not analyse.
    def stem_found?(stem)
      !stem.nil? && stem != '-'
    end

  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
class Morfologik
|
2
|
+
class TagsetParser
|
3
|
+
|
4
|
+
TAGS = {
|
5
|
+
"adj" => "pos",
|
6
|
+
"adja" => "pos",
|
7
|
+
"adjp" => "pos",
|
8
|
+
"adv" => "pos",
|
9
|
+
"num" => "pos",
|
10
|
+
"ppron12" => "pos",
|
11
|
+
"ppron3" => "pos",
|
12
|
+
"pred" => "pos",
|
13
|
+
"prep" => "pos",
|
14
|
+
"siebie" => "pos",
|
15
|
+
"subst" => "pos",
|
16
|
+
"verb" => "pos",
|
17
|
+
"conj" => "pos",
|
18
|
+
"qub" => "pos",
|
19
|
+
"burk" => "pos", # bound word
|
20
|
+
"interj" => "pos", # interjection
|
21
|
+
"interp" => "pos", # interpunction
|
22
|
+
"xxx" => "pos", # alien
|
23
|
+
"brev" => "pos", # abbreviation
|
24
|
+
"nie" => "pos",
|
25
|
+
"ign" => "pos",
|
26
|
+
"sg" => "number",
|
27
|
+
"pl" => "number",
|
28
|
+
"pltant" => "number",
|
29
|
+
"nom" => "case",
|
30
|
+
"gen" => "case",
|
31
|
+
"acc" => "case",
|
32
|
+
"dat" => "case",
|
33
|
+
"inst" => "case",
|
34
|
+
"loc" => "case",
|
35
|
+
"voc" => "case",
|
36
|
+
"pos" => "degree",
|
37
|
+
"comp" => "degree",
|
38
|
+
"sup" => "degree",
|
39
|
+
"m" => "gender",
|
40
|
+
"m1" => "gender",
|
41
|
+
"m2" => "gender",
|
42
|
+
"m3" => "gender",
|
43
|
+
"m4" => "gender",
|
44
|
+
"n" => "gender",
|
45
|
+
"f" => "gender",
|
46
|
+
"n1" => "gender",
|
47
|
+
"n2" => "gender",
|
48
|
+
"p1" => "gender",
|
49
|
+
"p2" => "gender",
|
50
|
+
"p3" => "gender",
|
51
|
+
"pri" => "person",
|
52
|
+
"sec" => "person",
|
53
|
+
"ter" => "person",
|
54
|
+
"depr" => "depreciativity",
|
55
|
+
"winien" => "winien",
|
56
|
+
"aff" => "negation",
|
57
|
+
"neg" => "negation",
|
58
|
+
"perf" => "aspect",
|
59
|
+
"imperf" => "aspect",
|
60
|
+
"?perf" => "aspect",
|
61
|
+
"nakc" => "accentability",
|
62
|
+
"akc" => "accentability",
|
63
|
+
"praep" => "post-prepositionality",
|
64
|
+
"npraep" => "post-prepositionality",
|
65
|
+
"ger" => "tense",
|
66
|
+
"imps" => "tense",
|
67
|
+
"inf" => "tense",
|
68
|
+
"fin" => "tense",
|
69
|
+
"bedzie" => "tense",
|
70
|
+
"praet" => "tense",
|
71
|
+
"refl" => "tense",
|
72
|
+
"pact" => "tense",
|
73
|
+
"pant" => "tense",
|
74
|
+
"pcon" => "tense",
|
75
|
+
"ppas" => "tense",
|
76
|
+
"impt" => "mode",
|
77
|
+
"pot" => "mode",
|
78
|
+
"indecl" => "uninflected",
|
79
|
+
"irreg" => "irregularity",
|
80
|
+
"pun" => "fullstoppedness",
|
81
|
+
"npun" => "fullstoppedness",
|
82
|
+
"wok" => "vocalicity",
|
83
|
+
"nwok" => "vocalicity",
|
84
|
+
"agl" => "agglutination",
|
85
|
+
"nagl" => "agglutination",
|
86
|
+
"_" => "unknown",
|
87
|
+
"congr" => "unknown",
|
88
|
+
"rec" => "unknown"
|
89
|
+
}
|
90
|
+
|
91
|
+
def parse(raw_tags)
|
92
|
+
tags = raw_tags.split(':')
|
93
|
+
|
94
|
+
category = find_part_of_speech(tags)
|
95
|
+
values = split_tags(tags)
|
96
|
+
|
97
|
+
return category, values
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
def find_part_of_speech(tags)
|
103
|
+
tags.shift
|
104
|
+
end
|
105
|
+
|
106
|
+
def split_tags(tags)
|
107
|
+
atom_tags = tags.map { |t| t.split('.') }
|
108
|
+
all = atom_tags.inject(1) { |c,t| c * t.size }
|
109
|
+
|
110
|
+
atom_tags.each_with_index do |tags, i|
|
111
|
+
atom_tags[i] += tags while atom_tags[i].size < all
|
112
|
+
end
|
113
|
+
|
114
|
+
result = []
|
115
|
+
all.times do |i|
|
116
|
+
result << atom_tags.map { |t| t[i] }.inject({}) { |hsh, t| hsh[TAGS[t]] = t; hsh }
|
117
|
+
end
|
118
|
+
|
119
|
+
return result
|
120
|
+
end
|
121
|
+
|
122
|
+
end
|
123
|
+
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: morfologik
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- snukky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: Ruby MRI bindings for morfologik-stemming library (Polish morphological
|
15
|
+
analyzer) written in Java.
|
16
|
+
email:
|
17
|
+
- snk987@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/morfologik.rb
|
23
|
+
- lib/morfologik/output_parser.rb
|
24
|
+
- lib/morfologik/tagset_parser.rb
|
25
|
+
- lib/morfologik/jar/morfologik-tools-1.5.2-standalone.jar
|
26
|
+
homepage: http://github.com/snukky/morfologik
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.10
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Ruby bindings for Morfologik.
|
50
|
+
test_files: []
|