groonga-synonym 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,122 @@
1
+ # Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
+ require "datasets"
17
+
18
+ require_relative "synonym"
19
+
20
module GroongaSynonym
  # Enumerates weighted synonym groups built from the Sudachi synonym
  # dictionary dataset.
  #
  # Each dataset record is read through its +group_id+, +lexeme_id+,
  # +expansion_type+ and +notation+ attributes. Iterating yields
  # <tt>(term, synonyms)</tt> pairs where +term+ is the notation of a
  # record whose expansion type is +:always+ and +synonyms+ is an array
  # of Synonym objects for the terms it expands to.
  class Sudachi
    include Enumerable

    # Weight given to a synonym that shares the typical record's lexeme ID.
    SAME_LEXEME_WEIGHT = 0.8
    # Weight given to a synonym whose lexeme ID differs from the typical one.
    OTHER_LEXEME_WEIGHT = 0.6

    def initialize
      @dataset = Datasets::SudachiSynonymDictionary.new
    end

    # Yields every term together with its filtered synonyms.
    #
    # Consecutive records with the same +group_id+ are treated as one
    # group. Returns an Enumerator when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      groups = {}
      group_id = nil
      group = nil
      @dataset.each do |synonym|
        # `group.nil?` covers the very first record even when its
        # group ID is nil; otherwise we would append to a nil group.
        if group.nil? || synonym.group_id != group_id
          emit_synonyms(groups, group)
          group_id = synonym.group_id
          group = [synonym]
        else
          group << synonym
        end
      end
      emit_synonyms(groups, group)
      filter_groups(groups) do |term, synonyms|
        yield(term, synonyms)
      end
    end

    private

    # Registers the synonyms of one dictionary group into +groups+
    # (a Hash of term => Array of Synonym).
    #
    # Records whose expansion type is +:never+ are ignored, and a group
    # with at most one remaining record produces nothing. Every
    # remaining +:always+ record becomes a key; its own entry gets a
    # nil weight and the other entries get a lexeme based weight.
    def emit_synonyms(groups, group)
      return if group.nil?

      target_synonyms = group.reject do |synonym|
        synonym.expansion_type == :never
      end
      return if target_synonyms.size <= 1

      target_synonyms.each_with_index do |typical, i|
        next unless typical.expansion_type == :always

        term = typical.notation
        synonyms = target_synonyms.each_with_index.map do |synonym, j|
          Synonym.new(synonym.notation, weight_for(typical, synonym, i == j))
        end
        # The same notation can be typical in more than one group
        # (e.g.: 働き手), so merge instead of overwriting.
        groups[term] = merge_synonyms(groups.fetch(term, []), synonyms)
      end
    end

    # Returns the weight of +synonym+ relative to +typical+: nil for the
    # typical record itself, otherwise a weight chosen by lexeme ID.
    def weight_for(typical, synonym, same_entry)
      return nil if same_entry

      if synonym.lexeme_id == typical.lexeme_id
        SAME_LEXEME_WEIGHT
      else
        OTHER_LEXEME_WEIGHT
      end
    end

    # Combines two synonym lists so that each term appears only once,
    # at the position of its first occurrence, keeping its strongest
    # entry.
    #
    # Without this, a term present with two different weights would be
    # kept twice and both copies would later be dropped by
    # #filter_groups, because each copy's term includes the other's.
    def merge_synonyms(existing, incoming)
      merged = {}
      (existing + incoming).each do |synonym|
        current = merged[synonym.term]
        if current.nil? || stronger?(synonym, current)
          merged[synonym.term] = synonym
        end
      end
      merged.values
    end

    # Whether +candidate+ should replace +current+: a nil weight (the
    # typical entry) always wins, otherwise the larger weight wins.
    def stronger?(candidate, current)
      return false if current.weight.nil?

      candidate.weight.nil? || candidate.weight > current.weight
    end

    # Yields <tt>(term, synonyms)</tt> for every group after removing
    # entries made redundant by substring relations between terms.
    def filter_groups(groups)
      groups.each do |term, synonyms|
        typical_synonyms, other_synonyms = synonyms.partition do |synonym|
          synonym.weight.nil?
        end
        typical_synonym = typical_synonyms.last

        others_sub_synonyms = []
        sub_synonyms = []
        super_synonyms = []
        other_synonyms.each do |synonym|
          contains_other = other_synonyms.any? do |other_synonym|
            other_synonym != synonym &&
              synonym.term.include?(other_synonym.term)
          end
          if contains_other
            # Contains another synonym of the same group.
            others_sub_synonyms << synonym
          elsif term.include?(synonym.term)
            # Is contained in the term itself.
            sub_synonyms << synonym
          elsif synonym.term.include?(term)
            # Contains the term itself.
            super_synonyms << synonym
          end
        end

        # `-=` builds new arrays, so the arrays stored in `groups`
        # are never mutated by the steps below.
        synonyms -= others_sub_synonyms
        synonyms -= super_synonyms
        unless sub_synonyms.empty?
          sorted_sub_synonyms = sub_synonyms.sort_by do |synonym|
            synonym.term.size
          end
          typical_sub_synonym, *other_sub_synonyms = sorted_sub_synonyms
          # Keep only the shortest contained synonym and re-add the
          # term itself with the complementary weight.
          synonyms -= other_sub_synonyms
          synonyms.delete(typical_synonym)
          synonyms << Synonym.new(typical_synonym.term,
                                  (1.0 - typical_sub_synonym.weight).round(2))
        end
        yield(term, synonyms)
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module GroongaSynonym
  # A synonym term with an optional weight.
  #
  # Two synonyms are equal when both their terms and their weights are
  # equal, which lets instances be used as Hash keys and in Array set
  # operations.
  class Synonym
    attr_reader :term
    attr_reader :weight

    # term::   the synonym text.
    # weight:: numeric weight, or nil when no weight applies.
    def initialize(term, weight=nil)
      @term = term
      @weight = weight
    end

    # Formats the synonym as an escaped term, prefixed with
    # <tt>></tt> and the weight offset (<tt>weight - 1</tt>) when a
    # weight other than 1.0 is set. For example a weight of 0.8 on
    # "term" gives ">-0.2term".
    def to_groonga
      # `+""` gives a mutable buffer even with frozen string literals.
      formatted = +""
      if @weight && @weight != 1.0
        formatted << ">" << format_weight_offset(@weight - 1)
      end
      formatted << escape_term(@term)
      formatted
    end

    def ==(other)
      other.is_a?(self.class) &&
        @term == other.term &&
        @weight == other.weight
    end

    def eql?(other)
      self == other
    end

    # Hash value consistent with #eql?: weights that compare equal
    # with == (such as 1 and 1.0) must produce the same hash, so real
    # numeric weights are normalized to Float before hashing.
    def hash
      weight = @weight
      weight = weight.to_f if weight.is_a?(Numeric) && weight.real?
      [@term, weight].hash
    end

    private

    # Renders +offset+ with up to six decimals and strips trailing
    # zeros, including a decimal point that would be left dangling
    # ("1.000000" becomes "1", "-0.200000" becomes "-0.2").
    def format_weight_offset(offset)
      ("%f" % offset).sub(/\.?0+\z/, "")
    end

    # Escapes +term+: the bare word "OR" is double-quoted, the
    # characters " ( ) \ * : + - are backslash-escaped, and a term
    # containing a space is wrapped in double quotes.
    def escape_term(term)
      return "\"#{term}\"" if term == "OR"

      escaped = term.gsub(/["()\\*:+-]/) do |matched|
        "\\#{matched}"
      end
      if escaped.include?(" ")
        "\"#{escaped}\""
      else
        escaped
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
2
+ #
3
+ # This program is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # This program is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
+
16
module GroongaSynonym
  # Version of the groonga-synonym gem. Frozen so that the shared
  # constant cannot be modified in place.
  VERSION = "1.0.0".freeze
end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: groonga-synonym
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Sutou Kouhei
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-07-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: red-datasets
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.3
27
+ description: ''
28
+ email:
29
+ - kou@clear-code.com
30
+ executables:
31
+ - groonga-synonym-generate
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - Gemfile
36
+ - LICENSE.txt
37
+ - README.md
38
+ - Rakefile
39
+ - bin/groonga-synonym-generate
40
+ - groonga-synonym.gemspec
41
+ - lib/groonga-synonym.rb
42
+ - lib/groonga-synonym/command-line/generator.rb
43
+ - lib/groonga-synonym/groonga-generator.rb
44
+ - lib/groonga-synonym/pgroonga-generator.rb
45
+ - lib/groonga-synonym/sudachi.rb
46
+ - lib/groonga-synonym/synonym.rb
47
+ - lib/groonga-synonym/version.rb
48
+ homepage: https://github.com/groonga/groonga-synonym
49
+ licenses:
50
+ - GPL-3.0+
51
+ metadata: {}
52
+ post_install_message:
53
+ rdoc_options: []
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubygems_version: 3.3.0.dev
68
+ signing_key:
69
+ specification_version: 4
70
+ summary: Groonga synonym provides tools for synonym of Groonga families.
71
+ test_files: []