morfologik 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/morfologik.rb ADDED
@@ -0,0 +1,106 @@
1
+ #encoding:utf-8
2
+
3
+ require 'pathname'
4
+ require 'open3'
5
+ require 'awesome_print'
6
+
7
+ require 'morfologik/output_parser'
8
+
9
# Thin wrapper around the Morfologik command-line stemmer (`plstem`), which is
# run through `java -jar`. Raw tool output is turned into Ruby hashes by
# OutputParser.
class Morfologik

  attr_reader :jar

  # @param [Hash] options
  # @option options [String] :jar path to the Morfologik .jar file
  #   (defaults to the jar bundled next to this file)
  # @option options [String] :input_encoding value passed to plstem as -ie
  #   (defaults to 'UTF-8')
  # @option options [String] :output_encoding value passed to plstem as -oe
  #   (defaults to 'UTF-8')
  # @raise [LoadError] if the jar does not exist or has no .jar extension
  def initialize(options={})
    @jar = options[:jar] || default_jar
    raise LoadError, "Morfologik .jar file not found" unless jar_file_exists?

    @output_parser = OutputParser.new
    @ie = options[:input_encoding] || 'UTF-8'
    @oe = options[:output_encoding] || 'UTF-8'
  end

  # Stems words giving their stems, categories and tags.
  #
  # @param [String, Array<String>] words words to stem; a String is split
  #   on whitespace
  # @return [Hash] analysis for each recognized word
  # @example
  #   Morfologik.new.stem("ma")
  #   # => {
  #     "ma" => [
  #       {
  #         :stem => "mieć",
  #         :category => "verb",
  #         :values => [
  #           { "tense" => "fin", "number" => "sg", "person" => "ter", "aspect" => "imperf" }
  #         ]
  #       },
  #       {
  #         :stem => "mój",
  #         :category => "adj",
  #         :values => [
  #           { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
  #           { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" }
  #         ]
  #       }
  #     ]
  #   }
  def stem(words)
    @output_parser.parse(run_jar(word_list(words)))
  end

  alias_method :lemmatize, :stem

  # Stems words giving only their stems.
  #
  # @param (see #stem)
  # @return [Hash] stems for each recognized word
  # @example
  #   # => { "ma" => [ "mieć", "mój" ] }
  def stem_simple(words)
    @output_parser.parse_stems_only(run_jar(word_list(words)))
  end

  alias_method :lemmatize_simple, :stem_simple

  # Gives only the grammatical categories of the words.
  #
  # @param (see #stem)
  # @return [Hash] categories for each recognized word
  def categories(words)
    @output_parser.parse_categories_only(run_jar(word_list(words)))
  end

  # Checks if given words have at least one common stem. Returns nil if
  # fewer than two distinct words are given or if analysis fails for any
  # of them.
  #
  # @param [String] *words words to check
  # @return [true, false, nil]
  def equal_stems?(*words)
    distinct = words.uniq
    return nil if distinct.size < 2

    stems = @output_parser.parse_stems_only(run_jar(words))
    # Every distinct word must have been recognized, otherwise the
    # comparison is meaningless.
    return nil unless distinct.size == stems.keys.size

    !stems.values.inject(&:&).empty?
  end

  private

  # Accepts either a whitespace-separated String or an already split list.
  def word_list(words)
    words.kind_of?(String) ? words.split : words
  end

  # Runs `plstem` on the distinct words and returns its stdout lines,
  # without the lines that start with "Processed".
  #
  # The command is given as an argument list and the words are written to
  # the child's stdin, so no shell is involved: quotes or other
  # metacharacters in the words (or spaces in the jar path) cannot break or
  # alter the command. capture3 also drains stderr and closes stdin, which
  # avoids blocking on a full pipe.
  #
  # @param [Array<String>] words
  # @return [Array<String>] raw output lines (empty when `java` cannot be
  #   started, matching the empty result the former shell pipeline gave)
  def run_jar(words)
    input = words.uniq.join(' ') + "\n"

    stdout, _stderr, _status = Open3.capture3(
      'java', '-jar', @jar.to_s, 'plstem', '-ie', @ie.to_s, '-oe', @oe.to_s,
      :stdin_data => input
    )

    stdout.each_line.reject { |line| line.start_with?('Processed') }
  rescue Errno::ENOENT
    []
  end

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  def jar_file_exists?
    File.exist?(@jar) && File.extname(@jar) == '.jar'
  end

  # Absolute path of the bundled jar. Symlinks are resolved only when the
  # file is present: Pathname#realpath raises Errno::ENOENT for a missing
  # file, which would hide the LoadError raised by #initialize.
  def default_jar
    path = File.expand_path('morfologik/jar/morfologik-tools-1.5.2-standalone.jar',
                            File.dirname(__FILE__))
    File.exist?(path) ? Pathname.new(path).realpath.cleanpath.to_s : path
  end

end
@@ -0,0 +1,61 @@
1
+ require 'morfologik/tagset_parser'
2
+
3
class Morfologik
  # Turns the raw output lines of the Morfologik stemmer into Ruby hashes.
  #
  # Every line is split on whitespace into `word`, `stem` and a tag
  # description. A stem of "-" marks a word that was not recognized. The tag
  # description may hold several readings joined with "+", each of the form
  # "category:tag:tag...".
  class OutputParser

    def initialize
      @tagset_parser = TagsetParser.new
    end

    # Full analysis: stem, category and tag values of every reading.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<Hash>}] word => list of
    #   { :stem, :category, :values } hashes, in output order
    def parse(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        # Blank or truncated lines carry no analysis; skip them instead of
        # calling #split on nil.
        next unless stem_found?(stem) && desc

        desc.split('+').each do |tags|
          category, values = @tagset_parser.parse(tags)
          (result[word] ||= []) << { :stem => stem, :category => category, :values => values }
        end
      end
    end

    # Only the distinct stems of every recognized word.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<String>}] word => unique stems
    def parse_stems_only(output)
      output.each_with_object({}) do |line, result|
        word, stem = line.split
        next unless stem_found?(stem)

        # Array#| appends while dropping duplicates and keeping order.
        result[word] = (result[word] || []) | [stem]
      end
    end

    # Only the distinct categories of every recognized word. All readings
    # joined with "+" are taken into account, consistently with #parse.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<String>}] word => unique categories
    def parse_categories_only(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem) && desc

        categories = desc.split('+').map { |tags| tags.split(':').first }
        result[word] = (result[word] || []) | categories
      end
    end

    private

    # A missing stem (blank line) counts as not found, just like the "-"
    # placeholder.
    def stem_found?(stem)
      !stem.nil? && stem != '-'
    end

  end
end
@@ -0,0 +1,123 @@
1
class Morfologik
  # Splits a single tag description such as "adj:sg:nom.voc:f:pos" into its
  # category (the first ":"-separated field) and a list of hashes mapping
  # grammatical attribute names to tag values.
  class TagsetParser

    # Maps every known tag to the name of the attribute it is a value of.
    TAGS = {
      "adj"     => "pos",
      "adja"    => "pos",
      "adjp"    => "pos",
      "adv"     => "pos",
      "num"     => "pos",
      "ppron12" => "pos",
      "ppron3"  => "pos",
      "pred"    => "pos",
      "prep"    => "pos",
      "siebie"  => "pos",
      "subst"   => "pos",
      "verb"    => "pos",
      "conj"    => "pos",
      "qub"     => "pos",
      "burk"    => "pos", # bound word
      "interj"  => "pos", # interjection
      "interp"  => "pos", # interpunction
      "xxx"     => "pos", # alien
      "brev"    => "pos", # abbreviation
      "nie"     => "pos",
      "ign"     => "pos",
      "sg"      => "number",
      "pl"      => "number",
      "pltant"  => "number",
      "nom"     => "case",
      "gen"     => "case",
      "acc"     => "case",
      "dat"     => "case",
      "inst"    => "case",
      "loc"     => "case",
      "voc"     => "case",
      "pos"     => "degree",
      "comp"    => "degree",
      "sup"     => "degree",
      "m"       => "gender",
      "m1"      => "gender",
      "m2"      => "gender",
      "m3"      => "gender",
      "m4"      => "gender",
      "n"       => "gender",
      "f"       => "gender",
      "n1"      => "gender",
      "n2"      => "gender",
      "p1"      => "gender",
      "p2"      => "gender",
      "p3"      => "gender",
      "pri"     => "person",
      "sec"     => "person",
      "ter"     => "person",
      "depr"    => "depreciativity",
      "winien"  => "winien",
      "aff"     => "negation",
      "neg"     => "negation",
      "perf"    => "aspect",
      "imperf"  => "aspect",
      "?perf"   => "aspect",
      "nakc"    => "accentability",
      "akc"     => "accentability",
      "praep"   => "post-prepositionality",
      "npraep"  => "post-prepositionality",
      "ger"     => "tense",
      "imps"    => "tense",
      "inf"     => "tense",
      "fin"     => "tense",
      "bedzie"  => "tense",
      "praet"   => "tense",
      "refl"    => "tense",
      "pact"    => "tense",
      "pant"    => "tense",
      "pcon"    => "tense",
      "ppas"    => "tense",
      "impt"    => "mode",
      "pot"     => "mode",
      "indecl"  => "uninflected",
      "irreg"   => "irregularity",
      "pun"     => "fullstoppedness",
      "npun"    => "fullstoppedness",
      "wok"     => "vocalicity",
      "nwok"    => "vocalicity",
      "agl"     => "agglutination",
      "nagl"    => "agglutination",
      "_"       => "unknown",
      "congr"   => "unknown",
      "rec"     => "unknown"
    }.freeze

    # @param [String] raw_tags one reading, e.g. "adj:sg:nom.voc:f:pos"
    # @return [Array(String, Array<Hash>)] the category and one hash per
    #   combination of the "."-separated alternatives
    def parse(raw_tags)
      category, *tags = raw_tags.split(':')

      return category, split_tags(tags)
    end

    private

    # Expands fields holding "."-separated alternatives into every
    # combination (Cartesian product) and maps each combination to an
    # { attribute name => tag } hash.
    #
    # Repeating each list and zipping by index, as done previously, covers
    # all combinations only when the list sizes are coprime; for
    # "sg.pl:nom.acc" it returned duplicates and lost sg/acc and pl/nom.
    #
    # A tag missing from TAGS ends up under a nil key.
    def split_tags(tags)
      head, *rest = tags.map { |t| t.split('.') }
      # No fields at all still yields a single (empty) combination.
      combinations = head ? head.product(*rest) : [[]]

      combinations.map do |combination|
        combination.each_with_object({}) { |tag, hsh| hsh[TAGS[tag]] = tag }
      end
    end

  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: morfologik
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - snukky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Ruby MRI bindings for morfologik-stemming library (Polish morphological
15
+ analyzer) written in Java.
16
+ email:
17
+ - snk987@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/morfologik.rb
23
+ - lib/morfologik/output_parser.rb
24
+ - lib/morfologik/tagset_parser.rb
25
+ - lib/morfologik/jar/morfologik-tools-1.5.2-standalone.jar
26
+ homepage: http://github.com/snukky/morfologik
27
+ licenses: []
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 1.8.10
47
+ signing_key:
48
+ specification_version: 3
49
+ summary: Ruby bindings for Morfologik.
50
+ test_files: []