morfologik 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/morfologik.rb ADDED
@@ -0,0 +1,106 @@
1
+ #encoding:utf-8
2
+
3
+ require 'pathname'
4
+ require 'open3'
5
+ require 'awesome_print'
6
+
7
+ require 'morfologik/output_parser'
8
+
9
# Ruby front end for the morfologik-tools Java archive (Polish morphological
# analyzer). Every query starts a `java -jar <jar> plstem` subprocess, feeds it
# the words on standard input and hands the printed lines to OutputParser.
class Morfologik

  # @return [String] path of the .jar file used for analysis
  attr_reader :jar

  # @param [Hash] options
  # @option options [String] :jar path to the morfologik-tools .jar
  #   (defaults to the copy shipped next to this file)
  # @option options [String] :input_encoding value passed to the tool as `-ie`
  #   (defaults to 'UTF-8')
  # @option options [String] :output_encoding value passed to the tool as `-oe`
  #   (defaults to 'UTF-8')
  # @raise [LoadError] if the jar path does not exist or does not end in `.jar`
  def initialize(options={})
    @jar = options[:jar] || default_jar
    raise LoadError, "Morfologik .jar file not found" unless jar_file_exists?

    @output_parser = OutputParser.new
    @ie = options[:input_encoding] || 'UTF-8'
    @oe = options[:output_encoding] || 'UTF-8'
  end

  # Stems words giving their stems, categories and tags.
  #
  # @param [String, Array<String>] words words to stem; a String is split
  #   on whitespace
  # @return [Hash] analysis for each recognized word
  # @example
  #   Morfologik.new.stem("ma")
  #   # => {
  #     "ma" => [
  #       {
  #         :stem => "mieć",
  #         :category => "verb",
  #         :values => [
  #           { "tense" => "fin", "number" => "sg", "person" => "ter", "aspect" => "imperf" }
  #         ]
  #       },
  #       {
  #         :stem => "mój",
  #         :category => "adj",
  #         :values => [
  #           { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
  #           { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" }
  #         ]
  #       }
  #   }
  def stem(words)
    @output_parser.parse(analyze(words))
  end

  alias_method :lemmatize, :stem

  # Stems words giving only their stems.
  #
  # @param (see #stem)
  # @return [Hash] stems for each recognized word
  # @example
  #   # => { "ma" => [ "mieć", "mój" ] }
  def stem_simple(words)
    @output_parser.parse_stems_only(analyze(words))
  end

  alias_method :lemmatize_simple, :stem_simple

  # Gives only the grammatical categories of words.
  #
  # @param (see #stem)
  # @return [Hash] categories for each recognized word
  def categories(words)
    @output_parser.parse_categories_only(analyze(words))
  end

  # Checks if given words have at least one common stem. Returns nil if
  # fewer than two distinct words are given or if any word is not analysed.
  #
  # @param [String] *words words to check
  # @return [true, false, nil]
  def equal_stems?(*words)
    distinct = words.uniq
    return nil if distinct.size < 2

    stems = @output_parser.parse_stems_only(run_jar(words))
    return nil unless distinct.size == stems.keys.size

    # Intersect the stem lists of all words; any survivor is a shared stem.
    !stems.values.inject(:&).empty?
  end

  private

  # Accepts either a whitespace-separated String or a list of words and
  # returns the raw output lines of the tool for them.
  def analyze(words)
    run_jar(words.kind_of?(String) ? words.split : words)
  end

  # Argument vector for the stemmer process. Passing a vector (instead of one
  # interpolated string) means no shell ever interprets the jar path or the
  # encoding names. `to_s` keeps non-String values (e.g. a Pathname) working.
  def jar_command
    ['java', '-jar', @jar.to_s, 'plstem', '-ie', @ie.to_s, '-oe', @oe.to_s]
  end

  # Runs the stemmer on the given words.
  #
  # @param [Array<String>] words words to analyse (duplicates are sent once)
  # @return [Array<String>] stdout lines, without the 'Processed ...' summary
  def run_jar(words)
    # Words travel over stdin rather than through `echo '...' |`, so quotes
    # and shell metacharacters inside a word are plain data.
    input = words.uniq.join(' ') + "\n"

    # capture3 reads stdout and stderr concurrently, so a chatty stderr
    # cannot block the child the way an unread popen3 pipe can.
    stdout, _stderr, _status = Open3.capture3(*jar_command, :stdin_data => input)

    stdout.each_line.reject { |line| line.start_with?('Processed') }
  rescue Errno::ENOENT
    # `java` is not on PATH. The shell pipeline used previously produced an
    # empty result in this case; keep that contract for existing callers.
    []
  end

  # True when @jar points at an existing path with a `.jar` extension.
  def jar_file_exists?
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    File.exist?(@jar) && File.extname(@jar) == '.jar'
  end

  # Absolute, symlink-resolved path of the jar bundled with this library.
  # Pathname#realpath raises Errno::ENOENT if the bundled jar is missing.
  def default_jar
    path = File.dirname(__FILE__) + '/morfologik/jar/morfologik-tools-1.5.2-standalone.jar'
    Pathname.new(path).realpath.cleanpath.to_s
  end

end
@@ -0,0 +1,61 @@
1
+ require 'morfologik/tagset_parser'
2
+
3
class Morfologik
  # Turns the raw lines printed by the stemmer into Ruby hashes keyed by word.
  #
  # Every line is split on whitespace into up to three columns: the word, its
  # stem and a tag description. A stem of '-' marks a line without a stem.
  # Lines with no stem column at all (e.g. blank lines) are ignored.
  class OutputParser

    def initialize
      @tagset_parser = TagsetParser.new
    end

    # Full analysis: stem, category and tag values.
    #
    # @param [Enumerable<String>] output raw output lines
    # @return [Hash{String => Array<Hash>}] for each word a list of
    #   `{ :stem, :category, :values }` hashes, one per '+'-separated
    #   alternative in the tag description
    def parse(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        # Without a tag description there is nothing to hand to the tagset parser.
        next unless stem_found?(stem) && desc

        desc.split('+').each do |tags|
          category, values = @tagset_parser.parse(tags)
          (result[word] ||= []) << { :stem => stem, :category => category, :values => values }
        end
      end
    end

    # Stems only.
    #
    # @param (see #parse)
    # @return [Hash{String => Array<String>}] distinct stems per word,
    #   in order of first appearance
    def parse_stems_only(output)
      output.each_with_object({}) do |line, result|
        word, stem = line.split
        next unless stem_found?(stem)

        stems = (result[word] ||= [])
        stems << stem unless stems.include?(stem)
      end
    end

    # Grammatical categories only.
    #
    # The category is the first ':'-separated field of each '+'-separated
    # alternative, the same unit #parse reports as :category.
    #
    # @param (see #parse)
    # @return [Hash{String => Array<String>}] distinct categories per word,
    #   in order of first appearance
    def parse_categories_only(output)
      output.each_with_object({}) do |line, result|
        word, stem, desc = line.split
        next unless stem_found?(stem) && desc

        desc.split('+').each do |tags|
          category = tags.split(':').first
          next if category.nil?

          categories = (result[word] ||= [])
          categories << category unless categories.include?(category)
        end
      end
    end

    private

    # A stem column is usable when it is present and is not the '-' placeholder.
    def stem_found?(stem)
      !stem.nil? && stem != '-'
    end

  end
end
@@ -0,0 +1,123 @@
1
class Morfologik
  # Splits one tag description such as "adj:sg:nom.voc:f:pos" into its
  # category (the first ':'-separated field) and a list of attribute hashes.
  class TagsetParser

    # Maps every known tag to the name of the grammatical attribute it is a
    # value of. Frozen because it is a shared lookup table.
    TAGS = {
      "adj"     => "pos",
      "adja"    => "pos",
      "adjp"    => "pos",
      "adv"     => "pos",
      "num"     => "pos",
      "ppron12" => "pos",
      "ppron3"  => "pos",
      "pred"    => "pos",
      "prep"    => "pos",
      "siebie"  => "pos",
      "subst"   => "pos",
      "verb"    => "pos",
      "conj"    => "pos",
      "qub"     => "pos",
      "burk"    => "pos", # bound word
      "interj"  => "pos", # interjection
      "interp"  => "pos", # interpunction
      "xxx"     => "pos", # alien
      "brev"    => "pos", # abbreviation
      "nie"     => "pos",
      "ign"     => "pos",
      "sg"      => "number",
      "pl"      => "number",
      "pltant"  => "number",
      "nom"     => "case",
      "gen"     => "case",
      "acc"     => "case",
      "dat"     => "case",
      "inst"    => "case",
      "loc"     => "case",
      "voc"     => "case",
      "pos"     => "degree",
      "comp"    => "degree",
      "sup"     => "degree",
      "m"       => "gender",
      "m1"      => "gender",
      "m2"      => "gender",
      "m3"      => "gender",
      "m4"      => "gender",
      "n"       => "gender",
      "f"       => "gender",
      "n1"      => "gender",
      "n2"      => "gender",
      "p1"      => "gender",
      "p2"      => "gender",
      "p3"      => "gender",
      "pri"     => "person",
      "sec"     => "person",
      "ter"     => "person",
      "depr"    => "depreciativity",
      "winien"  => "winien",
      "aff"     => "negation",
      "neg"     => "negation",
      "perf"    => "aspect",
      "imperf"  => "aspect",
      "?perf"   => "aspect",
      "nakc"    => "accentability",
      "akc"     => "accentability",
      "praep"   => "post-prepositionality",
      "npraep"  => "post-prepositionality",
      "ger"     => "tense",
      "imps"    => "tense",
      "inf"     => "tense",
      "fin"     => "tense",
      "bedzie"  => "tense",
      "praet"   => "tense",
      "refl"    => "tense",
      "pact"    => "tense",
      "pant"    => "tense",
      "pcon"    => "tense",
      "ppas"    => "tense",
      "impt"    => "mode",
      "pot"     => "mode",
      "indecl"  => "uninflected",
      "irreg"   => "irregularity",
      "pun"     => "fullstoppedness",
      "npun"    => "fullstoppedness",
      "wok"     => "vocalicity",
      "nwok"    => "vocalicity",
      "agl"     => "agglutination",
      "nagl"    => "agglutination",
      "_"       => "unknown",
      "congr"   => "unknown",
      "rec"     => "unknown"
    }.freeze

    # @param [String] raw_tags ':'-separated tags; a position may list
    #   several alternatives joined with '.'
    # @return [Array(String, Array<Hash>)] the category followed by one
    #   `{ attribute name => tag }` hash per combination of alternatives
    # @example
    #   parse("adj:sg:nom.voc:f:pos")
    #   # => ["adj", [
    #   #      { "number" => "sg", "case" => "nom", "gender" => "f", "degree" => "pos" },
    #   #      { "number" => "sg", "case" => "voc", "gender" => "f", "degree" => "pos" } ]]
    def parse(raw_tags)
      tags = raw_tags.split(':')

      category = find_part_of_speech(tags)
      values = split_tags(tags)

      return category, values
    end

    private

    # Removes and returns the leading tag, which names the category.
    def find_part_of_speech(tags)
      tags.shift
    end

    # Expands dotted alternatives into every combination (cartesian product)
    # and labels each tag with its attribute name from TAGS.
    def split_tags(tags)
      alternatives = tags.map { |tag| tag.split('.') }

      # Grow the combinations one position at a time. Starting from a single
      # empty combination yields [{}] when there are no tags at all.
      combinations = alternatives.inject([[]]) do |combos, options|
        combos.flat_map { |combo| options.map { |option| combo + [option] } }
      end

      combinations.map do |combo|
        combo.each_with_object({}) do |tag, attributes|
          # Tags missing from the table go under "unknown" instead of a nil key.
          attributes[TAGS.fetch(tag, "unknown")] = tag
        end
      end
    end

  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: morfologik
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - snukky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Ruby MRI bindings for morfologik-stemming library (Polish morphological
15
+ analyzer) written in Java.
16
+ email:
17
+ - snk987@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/morfologik.rb
23
+ - lib/morfologik/output_parser.rb
24
+ - lib/morfologik/tagset_parser.rb
25
+ - lib/morfologik/jar/morfologik-tools-1.5.2-standalone.jar
26
+ homepage: http://github.com/snukky/morfologik
27
+ licenses: []
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 1.8.10
47
+ signing_key:
48
+ specification_version: 3
49
+ summary: Ruby bindings for Morfologik.
50
+ test_files: []