groonga-synonym 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +24 -0
- data/LICENSE.txt +674 -0
- data/README.md +21 -0
- data/Rakefile +36 -0
- data/bin/groonga-synonym-generate +21 -0
- data/groonga-synonym.gemspec +51 -0
- data/lib/groonga-synonym.rb +20 -0
- data/lib/groonga-synonym/command-line/generator.rb +176 -0
- data/lib/groonga-synonym/groonga-generator.rb +55 -0
- data/lib/groonga-synonym/pgroonga-generator.rb +59 -0
- data/lib/groonga-synonym/sudachi.rb +122 -0
- data/lib/groonga-synonym/synonym.rb +61 -0
- data/lib/groonga-synonym/version.rb +18 -0
- metadata +71 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
# Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
require "datasets"
|
17
|
+
|
18
|
+
require_relative "synonym"
|
19
|
+
|
20
|
+
module GroongaSynonym
  # Builds weighted synonym groups from the Sudachi synonym dictionary
  # dataset and enumerates them as (term, synonyms) pairs.
  #
  # Dataset records are expected to respond to +group_id+, +lexeme_id+,
  # +expansion_type+ and +notation+; those are the only attributes read
  # here.
  class Sudachi
    include Enumerable

    # Weight given to a synonym that shares the typical entry's lexeme_id.
    SAME_LEXEME_WEIGHT = 0.8
    # Weight given to a synonym that has a different lexeme_id.
    OTHER_LEXEME_WEIGHT = 0.6

    def initialize
      @dataset = Datasets::SudachiSynonymDictionary.new
    end

    # Yields each term together with its Array of Synonym objects.
    #
    # Consecutive dataset records with the same +group_id+ are treated as
    # one group. Returns an Enumerator when no block is given.
    #
    # @yieldparam term [String] notation of the typical entry
    # @yieldparam synonyms [Array<Synonym>] synonyms registered for +term+
    def each
      return to_enum(__method__) unless block_given?

      groups = {}
      group_id = nil
      group = nil
      @dataset.each do |synonym|
        # `group.nil?` makes the very first record always open a group,
        # even when its group_id is nil (which equals the initial
        # group_id and would otherwise append to a nil group).
        if group.nil? || synonym.group_id != group_id
          emit_synonyms(groups, group)
          group_id = synonym.group_id
          group = [synonym]
        else
          group << synonym
        end
      end
      # Flush the trailing group.
      emit_synonyms(groups, group)
      filter_groups(groups) { |term, synonyms| yield(term, synonyms) }
    end

    private

    # Registers the synonyms of one dictionary group into +groups+.
    #
    # Entries whose expansion type is :never are ignored, and a group
    # with at most one remaining entry produces nothing. Every remaining
    # entry whose expansion type is :always becomes a key of +groups+.
    # Its value lists all remaining entries: the entry itself with a nil
    # weight, the others weighted by whether they share its lexeme_id.
    #
    # @param groups [Hash{String => Array<Synonym>}] accumulator, mutated
    # @param group [Array, nil] records of one group (nil before the first)
    def emit_synonyms(groups, group)
      return if group.nil?

      target_synonyms = group.reject do |synonym|
        synonym.expansion_type == :never
      end
      return if target_synonyms.size <= 1

      target_synonyms.each_with_index do |typical, i|
        next unless typical.expansion_type == :always

        term = typical.notation
        synonyms = target_synonyms.each_with_index.map do |synonym, j|
          weight =
            if i == j
              nil
            elsif synonym.lexeme_id == typical.lexeme_id
              SAME_LEXEME_WEIGHT
            else
              OTHER_LEXEME_WEIGHT
            end
          Synonym.new(synonym.notation, weight)
        end
        # The same notation may already be registered by an earlier
        # entry (e.g.: 働き手); merge without duplicates in that case.
        if groups.key?(term)
          groups[term] |= synonyms
        else
          groups[term] = synonyms
        end
      end
    end

    # Prunes substring-related synonyms from every group and yields the
    # result.
    #
    # For each term:
    # * a synonym whose text contains another synonym's text is dropped;
    # * a synonym whose text contains the term itself is dropped;
    # * when synonyms contained in the term exist, only the shortest one
    #   is kept and the nil-weight entry for the term is replaced by one
    #   weighted (1.0 - that synonym's weight), rounded to 2 digits.
    #
    # @param groups [Hash{String => Array<Synonym>}]
    # @yieldparam term [String]
    # @yieldparam synonyms [Array<Synonym>]
    def filter_groups(groups)
      groups.each do |term, synonyms|
        typical_synonyms, other_synonyms = synonyms.partition do |synonym|
          synonym.weight.nil?
        end
        # The nil-weight entry is the term itself.
        typical_synonym = typical_synonyms.last

        others_sub_synonyms = []
        sub_synonyms = []
        super_synonyms = []
        other_synonyms.each do |synonym|
          is_sub_synonym = other_synonyms.any? do |other_synonym|
            other_synonym != synonym &&
              synonym.term.include?(other_synonym.term)
          end
          if is_sub_synonym
            others_sub_synonyms << synonym
          elsif term.include?(synonym.term)
            sub_synonyms << synonym
          elsif synonym.term.include?(term)
            super_synonyms << synonym
          end
        end

        synonyms -= others_sub_synonyms
        synonyms -= super_synonyms
        unless sub_synonyms.empty?
          sorted_sub_synonyms = sub_synonyms.sort_by do |synonym|
            synonym.term.size
          end
          typical_sub_synonym, *other_sub_synonyms = sorted_sub_synonyms
          synonyms -= other_sub_synonyms
          synonyms.delete(typical_synonym)
          synonyms << Synonym.new(typical_synonym.term,
                                  (1.0 - typical_sub_synonym.weight).round(2))
        end
        yield(term, synonyms)
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
module GroongaSynonym
  # A synonym term with an optional weight.
  #
  # Instances compare equal (and hash identically) when both term and
  # weight match, so they can be used with Array#|, Array#- and as Hash
  # keys.
  class Synonym
    attr_reader :term
    attr_reader :weight

    # @param term [String] the synonym text
    # @param weight [Numeric, nil] optional weight; nil means "no weight"
    def initialize(term, weight=nil)
      @term = term
      @weight = weight
    end

    # Formats the synonym as a string for Groonga.
    #
    # When a weight is set and differs from 1.0, the escaped term is
    # prefixed with ">" followed by (weight - 1) with trailing zeros
    # removed, e.g. weight 0.8 gives ">-0.2term". Otherwise only the
    # escaped term is returned.
    #
    # @return [String]
    def to_groonga
      "#{weight_prefix}#{escape_term(@term)}"
    end

    def ==(other)
      other.is_a?(self.class) &&
        @term == other.term &&
        @weight == other.weight
    end

    def eql?(other)
      self == other
    end

    def hash
      [@term, @weight].hash
    end

    private

    # Returns the ">N" weight prefix, or "" when no adjustment applies.
    def weight_prefix
      return "" unless @weight && @weight != 1.0

      # Strip trailing zeros and a dangling decimal point so that whole
      # adjustments render as ">1" rather than ">1.".
      adjustment = ("%f" % (@weight - 1)).sub(/\.?0+\z/, "")
      ">#{adjustment}"
    end

    # Escapes +term+ for output.
    #
    # The literal "OR" is wrapped in double quotes (presumably because
    # it is an operator in Groonga's query syntax -- confirm against the
    # Groonga documentation). Otherwise the characters " ( ) \ * : + -
    # are backslash-escaped, and the result is wrapped in double quotes
    # when it contains a space.
    def escape_term(term)
      return "\"#{term}\"" if term == "OR"

      escaped = term.gsub(/["()\\*:+-]/) do |matched|
        "\\#{matched}"
      end
      escaped.include?(" ") ? "\"#{escaped}\"" : escaped
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Copyright (C) 2021 Sutou Kouhei <kou@clear-code.com>
|
2
|
+
#
|
3
|
+
# This program is free software: you can redistribute it and/or modify
|
4
|
+
# it under the terms of the GNU General Public License as published by
|
5
|
+
# the Free Software Foundation, either version 3 of the License, or
|
6
|
+
# (at your option) any later version.
|
7
|
+
#
|
8
|
+
# This program is distributed in the hope that it will be useful,
|
9
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
# GNU General Public License for more details.
|
12
|
+
#
|
13
|
+
# You should have received a copy of the GNU General Public License
|
14
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
15
|
+
|
16
|
+
module GroongaSynonym
  # Version string of this gem; frozen so it cannot be mutated in place.
  VERSION = "1.0.0".freeze
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: groonga-synonym
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Sutou Kouhei
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-07-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: red-datasets
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.1.3
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.1.3
|
27
|
+
description: ''
|
28
|
+
email:
|
29
|
+
- kou@clear-code.com
|
30
|
+
executables:
|
31
|
+
- groonga-synonym-generate
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- Gemfile
|
36
|
+
- LICENSE.txt
|
37
|
+
- README.md
|
38
|
+
- Rakefile
|
39
|
+
- bin/groonga-synonym-generate
|
40
|
+
- groonga-synonym.gemspec
|
41
|
+
- lib/groonga-synonym.rb
|
42
|
+
- lib/groonga-synonym/command-line/generator.rb
|
43
|
+
- lib/groonga-synonym/groonga-generator.rb
|
44
|
+
- lib/groonga-synonym/pgroonga-generator.rb
|
45
|
+
- lib/groonga-synonym/sudachi.rb
|
46
|
+
- lib/groonga-synonym/synonym.rb
|
47
|
+
- lib/groonga-synonym/version.rb
|
48
|
+
homepage: https://github.com/groonga/groonga-synonym
|
49
|
+
licenses:
|
50
|
+
- GPL-3.0+
|
51
|
+
metadata: {}
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubygems_version: 3.3.0.dev
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: Groonga synonym provides tools for synonym of Groonga families.
|
71
|
+
test_files: []
|