sekka 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ :; #-*- mode: nendo; syntax: scheme -*-;;
2
+ ;;;
3
+ ;;; alphabet-lib.nnd - アルファベットの変換ライブラリ
4
+ ;;;
5
+ ;;; Copyright (c) 2010 Kiyoka Nishiyama <kiyoka@sumibi.org>
6
+ ;;;
7
+ ;;; Redistribution and use in source and binary forms, with or without
8
+ ;;; modification, are permitted provided that the following conditions
9
+ ;;; are met:
10
+ ;;;
11
+ ;;; 1. Redistributions of source code must retain the above copyright
12
+ ;;; notice, this list of conditions and the following disclaimer.
13
+ ;;;
14
+ ;;; 2. Redistributions in binary form must reproduce the above copyright
15
+ ;;; notice, this list of conditions and the following disclaimer in the
16
+ ;;; documentation and/or other materials provided with the distribution.
17
+ ;;;
18
+ ;;; 3. Neither the name of the authors nor the names of its contributors
19
+ ;;; may be used to endorse or promote products derived from this
20
+ ;;; software without specific prior written permission.
21
+ ;;;
22
+ ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23
+ ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24
+ ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25
+ ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26
+ ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28
+ ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29
+ ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30
+ ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31
+ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32
+ ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
+ ;;;
34
+ ;;; $Id:
35
+ ;;;
36
+
37
;; Convert half-width (hankaku) ASCII characters to full-width (zenkaku).
;;
;;   str : string to convert (its `tr` method is called).
;;
;; Returns a new string in which every character in the range "!".."}"
;; (U+0021..U+007D) is replaced by the character at the same offset in
;; the full-width range "！".."｝" (U+FF01..U+FF5D).  Characters outside
;; the source range are left untouched.
;;
;; The target range must be written with full-width characters: if both
;; ranges are the ASCII range the translation maps every character onto
;; itself and nothing is converted.
(define (gen-alphabet-han->zen str)
  (str.tr "!-}" "！-｝"))
40
+
41
;; Convert full-width (zenkaku) characters to half-width (hankaku) ASCII.
;;
;;   str : string to convert (its `tr` method is called).
;;
;; Returns a new string in which every character in the full-width range
;; "！".."｝" (U+FF01..U+FF5D) is replaced by the character at the same
;; offset in the ASCII range "!".."}" (U+0021..U+007D).  Characters
;; outside the source range are left untouched.
;;
;; The source range must be written with full-width characters: if both
;; ranges are the ASCII range the translation maps every character onto
;; itself and nothing is converted.
(define (gen-alphabet-zen->han str)
  (str.tr "！-｝" "!-}"))
44
+
45
;; Check whether a string consists only of full-width (zenkaku) characters.
;;
;;   str : string to test.
;;
;; Returns #t when str is one or more characters that all lie in the
;; full-width range "！".."｝" (U+FF01..U+FF5D); otherwise #f (including
;; for the empty string, because the pattern requires at least one
;; character).
;;
;; The character class must use the full-width range; with the ASCII
;; range this predicate would be indistinguishable from
;; is-alphabet-hankaku.
(define (is-alphabet-zenkaku str)
  (if (rxmatch #/^[！-｝]+$/ str) #t #f))
48
+
49
;; Check whether a string consists only of half-width (hankaku) characters.
;;
;;   str : string to test.
;;
;; Returns #t when str is one or more characters that all lie in the
;; ASCII range "!".."}"; otherwise #f (the pattern requires at least one
;; character, so the empty string yields #f).
(define (is-alphabet-hankaku str)
  (cond
   ((rxmatch #/^[!-}]+$/ str) #t)
   (else #f)))
52
+
53
;; Check whether a string contains any full-width (zenkaku) character.
;;
;;   str : string to test.
;;
;; Returns #t when at least one character of str lies in the full-width
;; range "！".."｝" (U+FF01..U+FF5D); otherwise #f.
;;
;; The character class must use the full-width range; with the ASCII
;; range this predicate would be indistinguishable from
;; include-alphabet-hankaku.
(define (include-alphabet-zenkaku str)
  (if (rxmatch #/[！-｝]+/ str) #t #f))
56
+
57
;; Check whether a string contains any half-width (hankaku) character.
;;
;;   str : string to test.
;;
;; Returns #t when at least one character of str lies in the ASCII range
;; "!".."}"; otherwise #f.
(define (include-alphabet-hankaku str)
  (cond
   ((rxmatch #/[!-}]+/ str) #t)
   (else #f)))
@@ -0,0 +1,72 @@
1
+ # approximatesearch.rb - "approximate search library"
2
+ #
3
+ # Copyright (c) 2010 Kiyoka Nishiyama <kiyoka@sumibi.org>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or without
6
+ # modification, are permitted provided that the following conditions
7
+ # are met:
8
+ #
9
+ # 1. Redistributions of source code must retain the above copyright
10
+ # notice, this list of conditions and the following disclaimer.
11
+ #
12
+ # 2. Redistributions in binary form must reproduce the above copyright
13
+ # notice, this list of conditions and the following disclaimer in the
14
+ # documentation and/or other materials provided with the distribution.
15
+ #
16
+ # 3. Neither the name of the authors nor the names of its contributors
17
+ # may be used to endorse or promote products derived from this
18
+ # software without specific prior written permission.
19
+ #
20
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26
+ # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ #
32
+ # $Id:
33
+ #
34
+ require 'fuzzystringmatch'
35
+ require 'sekka/kvs'
36
+
37
# Approximate (fuzzy) keyword lookup on top of a key-value store, scored
# with a Jaro-Winkler matcher.
class ApproximateSearch
  # jarow_shikii :: score threshold ("shikii"); only candidates whose
  #                 score is strictly greater than this value are kept.
  def initialize( jarow_shikii )
    @jarow_shikii = jarow_shikii
    @jarow        = FuzzyStringMatch::JaroWinkler.new.create( :native )
  end

  # Score every string in +arr+ against +keyword+ (both compared in
  # lower case) and keep those scoring above the threshold.
  #
  # Returns an array of [ score, original_string ] pairs ordered from
  # the highest score to the lowest.
  def filtering( keyword, arr )
    needle = keyword.downcase
    hits   = []
    arr.each do |candidate|
      score = @jarow.getDistance( needle, candidate.downcase )
      #printf( " [%s] vs [%s] => %f\n", needle, candidate.downcase, score )
      hits << [ score, candidate ] if score > @jarow_shikii
    end
    # Sorting on (1.0 - score) puts the best match first.
    hits.sort_by { |pair| 1.0 - pair[0] }
  end

  # Look up the candidate list stored for the first two characters of
  # +keyword+ and return the fuzzy matches among them.
  #
  # userid    :: prefix of the per-user key ("<userid>::(<xx>)").
  # kvs       :: store object answering +get+.
  # keyword   :: search keyword.
  # okuri_ari :: when true the two-character prefix is upper-cased,
  #              otherwise it is lower-cased.
  #
  # The per-user entry is tried first; when it is missing the
  # "MASTER::" entry is used.  Returns the result of +filtering+, or an
  # empty array when neither entry exists.
  def search( userid, kvs, keyword, okuri_ari )
    prefix        = keyword.slice( 0, 2 )
    prefix        = okuri_ari ? prefix.upcase : prefix.downcase
    readymade_key = "(" + prefix + ")"

    str = kvs.get( userid + "::" + readymade_key, false ) ||
          kvs.get( "MASTER::" + readymade_key )

    #printf( "#readymade_key %s : %s\n", readymade_key, str )
    return [ ] unless str

    # Candidates are stored as a single space-separated string.
    filtering( keyword, str.split( /[ ]+/ ))
  end
end
@@ -0,0 +1,129 @@
1
+ :; #-*- mode: nendo; syntax: scheme -*-;;
2
+ ;;;
3
+ ;;; convert-jisyo.nnd - SKK-JISYO形式から SEKKA-JISYO形式へのコンバートロジック
4
+ ;;;
5
+ ;;; Copyright (c) 2010 Kiyoka Nishiyama <kiyoka@sumibi.org>
6
+ ;;;
7
+ ;;; Redistribution and use in source and binary forms, with or without
8
+ ;;; modification, are permitted provided that the following conditions
9
+ ;;; are met:
10
+ ;;;
11
+ ;;; 1. Redistributions of source code must retain the above copyright
12
+ ;;; notice, this list of conditions and the following disclaimer.
13
+ ;;;
14
+ ;;; 2. Redistributions in binary form must reproduce the above copyright
15
+ ;;; notice, this list of conditions and the following disclaimer in the
16
+ ;;; documentation and/or other materials provided with the distribution.
17
+ ;;;
18
+ ;;; 3. Neither the name of the authors nor the names of its contributors
19
+ ;;; may be used to endorse or promote products derived from this
20
+ ;;; software without specific prior written permission.
21
+ ;;;
22
+ ;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23
+ ;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24
+ ;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25
+ ;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26
+ ;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ ;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28
+ ;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29
+ ;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30
+ ;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31
+ ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32
+ ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
+ ;;;
34
+ ;;; $Id:
35
+ ;;;
36
+ (use srfi-1)
37
+ (use sekka.util)
38
+ (use sekka.roman-lib)
39
+
40
+
41
;; Expand one dictionary entry without okurigana into (key . value) pairs.
;;
;;   key   : reading of the entry (passed to gen-hiragana->roman-list).
;;   value : candidate string of the entry.
;;
;; Returns a list holding one pair (roman . "C<key>") for every romaji
;; spelling produced by gen-hiragana->roman-list, followed by the pair
;; (key . value) itself.
;; NOTE(review): the "C" prefix presumably marks a reference to the kana
;; entry -- confirm against the dictionary reader.
;;
;; When more than 1000 romaji spellings are produced, a warning is
;; printed to STDERR and #f is returned so that the entry is ignored.
(define (expand-okuri-nashi-entry key value)
  (let* ((roman-list    (gen-hiragana->roman-list key))
         (pattern-count (length roman-list)))
    (cond
     ((< 1000 pattern-count)
      ;; words whose number of patterns exploded are dropped
      (STDERR.printf " Warning: ignored entry [%s %s] , because too many pattens.\n" key value)
      #f)
     (else
      (let1 roman-entries (map
                           (lambda (roman)
                             (cons roman (+ "C" key)))
                           roman-list)
        (append roman-entries
                (list (cons key value))))))))
53
+
54
+
55
;; Expand one dictionary entry with okurigana into (key . value) pairs
;; for a single okuri marker.
;;
;;   key   : reading of the entry without the okuri marker.
;;   okuri : okuri marker string appended to the reading.
;;   value : candidate string of the entry.
;;
;; Returns a list holding one pair
;;   (<roman><upcased okuri> . "C<key><okuri>")
;; for every romaji spelling produced by gen-hiragana->roman-list,
;; followed by the pair (<key><okuri> . value).
(define (expand-okuri-ari-entry-internal key okuri value)
  ;; build the pair for one romaji spelling of the reading
  (define (roman->entry roman)
    (cons (+ roman (sekka-upcase okuri))
          (+ "C" key okuri)))
  (let ((roman-entries (map roman->entry (gen-hiragana->roman-list key)))
        (kana-entry    (cons (+ key okuri) value)))
    (append roman-entries
            (list kana-entry))))
63
+
64
;; Expand one dictionary entry with okurigana into (key . value) pairs.
;;
;;   key   : reading of the entry without the okuri marker.
;;   okuri : okuri marker string taken from the dictionary key.
;;   value : candidate string of the entry.
;;
;; For every okuri other than "t" this is exactly
;; expand-okuri-ari-entry-internal.  For "t" the entry is expanded three
;; times -- with "t" itself and with the upcased "@" and ";" markers --
;; and the three lists are concatenated in that order.
;; NOTE(review): "@" and ";" are presumably alternative keystrokes for
;; the same okuri -- confirm against the input-method key map.
;;
;; The okuri is compared with equal?, which compares string contents.
;; eq? tests object identity, and a string read from the dictionary is
;; never guaranteed to be the same object as the literal "t".
(define (expand-okuri-ari-entry key okuri value)
  (if (equal? "t" okuri)
      (append
       (expand-okuri-ari-entry-internal key okuri value)
       (expand-okuri-ari-entry-internal key (sekka-upcase "@") value)
       (expand-okuri-ari-entry-internal key (sekka-upcase ";") value))
      (expand-okuri-ari-entry-internal key okuri value)))
74
+
75
+
76
;; Convert an SKK-JISYO formatted dictionary into SEKKA-JISYO lines.
;;
;;   f : input object answering `readlines` (every line of the dictionary
;;       is read up front).
;;
;; Returns a list of strings, one per generated entry, each formatted as
;; "<key> <value>".  Comment lines, lines that split-dict-line rejects,
;; entries dropped by expand-okuri-nashi-entry and entries whose key
;; contains kanji produce no output.
;;
;; Progress is reported on STDERR every 10000 input lines.
(define (convert-skk-jisyo-f f)
  (define total 0)
  (define current 0)

  ;; Count one processed line and print a progress line every 10000.
  ;; The literal percent sign is written as "%%": a lone "%" followed by
  ;; ")" is a malformed printf directive.
  (define (display-progress line)
    (set! current (+ current 1))
    (when (= 0 (% current 10000))
      (STDERR.printf " %7d/%7d (%3.3f%%)\n" current total (* (/ current (total.to_f)) 100.0))))

  ;; Turn one dictionary line into a list of (key . value) pairs,
  ;; or #f when the line yields no entry.
  (define (gen-sekka-entries line)
    (display-progress line)

    (let* ((line (line.sub #/\/$/ ""))      ;; drop the trailing "/"
           (fields (split-dict-line line)))
      (cond
       ((rxmatch #/^\;/ line)
        ;; comment line
        #f)
       ((not fields)
        ;; format error
        #f)
       ((or (is-hiragana (first fields))
            (rxmatch #/^([>あ-ん]+)$/ (first fields)))
        ;; entry without okurigana
        (expand-okuri-nashi-entry (first fields) (second fields)))
       ((rxmatch #/^([>あ-ん]+)([a-z])$/ (first fields))
        => (lambda (m)
             ;; entry with okurigana: group 1 is the reading,
             ;; group 2 the single-letter okuri marker
             (expand-okuri-ari-entry (rxmatch-substring m 1)
                                     (rxmatch-substring m 2)
                                     (second fields))))
       ((rxmatch #/[亜-瑤]+/ (first fields))
        ;; the key contains at least one kanji
        #f)
       (else
        (list (cons (first fields) (second fields)))))))

  (let* ((lines
          (map
           (lambda (line)
             (line.chomp))
           (f.readlines.to_list)))
         ;; binding used only for its side effect: record the line count
         ;; before any line is processed so progress can show a total
         (_ (set! total (length lines)))
         (entry-list
          (filter
           (lambda (x) x)                   ;; drop the #f results
           (map gen-sekka-entries lines))))
    (map
     (lambda (entry)
       (sprintf "%s %s" (car entry) (cdr entry)))
     (apply append! entry-list))))
127
+
128
+
129
+