docdiff 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
# Difference class for DocDiff
|
2
|
+
# 2003-03-24 ..
|
3
|
+
# Hisashi MORITA
|
4
|
+
|
5
|
+
require 'docdiff/diff'
|
6
|
+
|
7
|
+
# An edit script between two sequences, stored as an Array of blocks.
#
# Every block is a three-element array whose first element is a tag:
# :common_elt_elt, :del_elt, :add_elt or :change_elt.  The deleted elements
# are read from index 1 of a block and the added elements from index 2.
# A deletion immediately followed by an addition is merged into a single
# :change_elt block by #combine_del_add_to_change!.
class Difference < Array

  # Unused attribute sketch kept from the original source:
  # @resolution = nil # char, word, phrase, sentence, line, paragraph..
  # @codeset = ''
  # @eol_char = "\n"
  # @source = 'source'
  # @target = 'target'
  # attr_accessor :resolution, :codeset, :eol_char, :source, :target

  # The blocks exactly as yielded by Diff#ses, before deletions and
  # additions are merged.  Always an Array (empty when no input was given).
  attr_accessor :raw_list

  # Builds the difference between array1 and array2.
  #
  # When both arguments are nil the result is an empty Difference with an
  # empty raw_list; Diff is not consulted at all in that case.
  def initialize(array1 = nil, array2 = nil)
    super()
    # Initialise unconditionally so raw_list is never nil.
    @raw_list = []
    return if array1.nil? && array2.nil?

    # Diff::EditScript does not have each_with_index(), so copy the blocks
    # into a plain Array first.
    Diff.new(array1, array2).ses.each { |block| @raw_list << block }
    combine_del_add_to_change!
  end

  # Appends the blocks of raw_list to self, merging every :del_elt block
  # that is directly followed by an :add_elt block into one :change_elt
  # block.  A deletion is held back until the following block is seen:
  # * followed by :add_elt        -> emitted as [:change_elt, deleted, added]
  # * followed by :common_elt_elt -> emitted unchanged, before the common block
  # * followed by :del_elt        -> emitted unchanged (it used to be lost)
  # * last block                  -> emitted unchanged
  #
  # Raises RuntimeError when a block carries an unknown tag.
  def combine_del_add_to_change!
    last_index = @raw_list.size - 1
    @raw_list.each_with_index do |block, i|
      previous = i.zero? ? nil : @raw_list[i - 1]
      pending_del = (previous && previous.first == :del_elt) ? previous : nil

      case block.first
      when :common_elt_elt
        self << pending_del if pending_del
        self << block
      when :del_elt
        # Flush an earlier deletion instead of silently dropping it.
        self << pending_del if pending_del
        # A trailing deletion has no successor to emit it, so emit it now;
        # otherwise let the next block decide what to do with it.
        self << block if i == last_index
      when :add_elt
        if pending_del
          self << [:change_elt, pending_del[1], block[2]]
        else
          self << block
        end
      else
        raise "the first element of the block #{i} is invalid: (#{block.first})\n"
      end
    end
  end

  # Returns a copy restricted to the former (first) sequence: :add_elt
  # blocks are removed and the added part of every :change_elt is nil.
  def former_only
    keep_one_side(:add_elt, 2)
  end

  # Returns a copy restricted to the latter (second) sequence: :del_elt
  # blocks are removed and the deleted part of every :change_elt is nil.
  def latter_only
    keep_one_side(:del_elt, 1)
  end

  private

  # Shared implementation of former_only/latter_only.  Works on a dup so
  # the receiver is untouched and the result keeps the receiver's class.
  def keep_one_side(dropped_tag, blanked_slot)
    kept = dup.delete_if { |e| e[0] == dropped_tag }
    kept.collect! do |e|
      if e[0] == :change_elt
        stripped = [e[0], e[1], e[2]]
        stripped[blanked_slot] = nil
        stripped
      else
        e
      end
    end
    kept
  end

end # class Difference
|
@@ -0,0 +1,127 @@
|
|
1
|
+
# Document class, a part of DocDiff
|
2
|
+
# 2004-01-14.. Hisashi MORITA
|
3
|
+
|
4
|
+
require 'docdiff/charstring'
|
5
|
+
|
6
|
+
# Raised by Document#initialize when no encoding was given and automatic
# detection reported "UNKNOWN".
# NOTE: this derives from Exception, not StandardError, so a bare `rescue`
# does not catch it; callers must name the class explicitly.
class EncodingDetectionFailure < Exception; end
|
8
|
+
# Raised by Document#initialize when no end-of-line convention was given
# and automatic detection reported "UNKNOWN".
# NOTE: this derives from Exception, not StandardError, so a bare `rescue`
# does not catch it; callers must name the class explicitly.
class EOLDetectionFailure < Exception; end
|
10
|
+
|
11
|
+
# A text body together with its encoding and end-of-line convention.
#
# The wrapped string is extended with CharString, and every query method
# below simply forwards to the method of the same name on that string.
class Document

  # str:: the text; it is extended in place with CharString.
  # enc:: encoding to assign.  When omitted and the body reports no
  #       encoding, CharString.guess_encoding is consulted.
  # e::   end-of-line value to assign.  When omitted,
  #       CharString.guess_eol is consulted.
  #
  # Raises EncodingDetectionFailure / EOLDetectionFailure when the
  # corresponding guess comes back as "UNKNOWN".
  def initialize(str, enc = nil, e = nil)
    @body = str
    @body.extend CharString

    if enc
      @body.encoding = enc
    elsif !@body.encoding
      encoding_guess = CharString.guess_encoding(str)
      if encoding_guess == "UNKNOWN"
        # Deliberately no fallback to ASCII here: a wrong default is worse
        # than an explicit failure.
        raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
      end
      @body.encoding = encoding_guess
    end

    if e
      @body.eol = e
    else
      eol_guess = CharString.guess_eol(str)
      if eol_guess == "UNKNOWN"
        raise EOLDetectionFailure, "eol not specified, and auto detection failed."
      end
      @body.eol = eol_guess
    end
  end

  # --- encoding / end-of-line accessors (stored on the body itself) ---
  def encoding; @body.encoding; end
  def encoding=(cs); @body.encoding = cs; end
  def eol; @body.eol; end
  def eol=(eolstr); @body.eol = eolstr; end

  # --- splitting ---
  def split_to_line; @body.split_to_line; end
  def split_to_word; @body.split_to_word; end
  def split_to_char; @body.split_to_char; end
  def split_to_byte; @body.split_to_byte; end

  # --- line counts ---
  def count_line; @body.count_line; end
  def count_blank_line; @body.count_blank_line; end
  def count_empty_line; @body.count_empty_line; end
  def count_graph_line; @body.count_graph_line; end

  # --- word counts ---
  def count_word; @body.count_word; end
  def count_latin_word; @body.count_latin_word; end
  def count_ja_word; @body.count_ja_word; end
  def count_valid_word; @body.count_valid_word; end
  def count_latin_valid_word; @body.count_latin_valid_word; end
  def count_ja_valid_word; @body.count_ja_valid_word; end

  # --- character counts ---
  def count_char; @body.count_char; end
  def count_blank_char; @body.count_blank_char; end
  def count_graph_char; @body.count_graph_char; end
  def count_latin_blank_char; @body.count_latin_blank_char; end
  def count_latin_graph_char; @body.count_latin_graph_char; end
  def count_ja_blank_char; @body.count_ja_blank_char; end
  def count_ja_graph_char; @body.count_ja_graph_char; end

  # --- byte count ---
  def count_byte; @body.count_byte; end

  # --- end-of-line character(s) as reported by the body ---
  def eol_char; @body.eol_char; end

end # class Document
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# English ASCII encoding module for CharString
|
2
|
+
# 2003- Hisashi MORITA
|
3
|
+
|
4
|
+
module CharString
  # Character-class tables for US-ASCII text.  Each constant is a string of
  # member characters meant to be interpolated into a regexp character
  # class ("[#{...}]"), as WORD_REGEXP_SRC below does.
  module ASCII

    Encoding = "US-ASCII"

    # 0x00-0x1f and 0x7f
    CNTRL  = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
             "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
             "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
             "\x1e\x1f\x7f"
    SPACE  = "\t\n\v\f\r "   # 0x09-0x0d, 0x20
    BLANK  = "\t "           # 0x09, 0x20
    DIGIT  = "0123456789"
    ALPHA  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
             "abcdefghijklmnopqrstuvwxyz"
    ALNUM  = DIGIT + ALPHA
    PUNCT  = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    LOWER  = "abcdefghijklmnopqrstuvwxyz"
    UPPER  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e
    PRINT  = ' !"#$%&\'()*+,-./0123456789:;<=>?@' \
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`' \
             'abcdefghijklmnopqrstuvwxyz{|}~'
    # 0x21-0x7e
    GRAPH  = '!"#$%&\'()*+,-./0123456789:;<=>?@' \
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`' \
             'abcdefghijklmnopqrstuvwxyz{|}~'
    XDIGIT = DIGIT + "ABCDEFabcdef"

    JA_BLANK = "" # kludge...
    JA_GRAPH = "" # kludge...

    # kludge to avoid warning "character class has `[' without escape":
    # the tables containing regexp metacharacters are escaped in place.
    [PUNCT, PRINT, GRAPH].each do |table|
      table.replace(Regexp.quote(table))
    end

    # Alternatives, tried in order: a run of graphic characters with at
    # most one trailing blank; a run of whitespace; anything else, lazily.
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "(?:[#{SPACE}]+)",
                       "(?:.+?)"].join("|")

    # The four methods below override the default methods, as ASCII has no
    # Japanese in it.
    def count_ja_graph_char; 0; end

    def count_ja_blank_char; 0; end

    def count_ja_word; 0; end

    def count_ja_valid_word; 0; end

    CharString.register_encoding(self)

  end # module ASCII
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
# Japanese EUC-JP encoding module for CharString
|
2
|
+
# 2003- Hisashi MORITA
|
3
|
+
|
4
|
+
module CharString
  # Character-class tables for EUC-JP text.  Each constant is a string of
  # member characters (or, for KANJI, of "first-last" ranges) meant to be
  # interpolated into a regexp character class ("[#{...}]"), as
  # WORD_REGEXP_SRC below does.
  module EUC_JP

    Encoding = "EUC-JP"

    # character table based on:
    # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT

    # --- single-byte (ASCII) classes ---

    # 0x00-0x1f and 0x7f
    CNTRL  = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
             "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
             "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
             "\x1e\x1f\x7f"
    SPACE  = "\t\n\v\f\r "   # 0x09-0x0d, 0x20
    BLANK  = "\t "           # 0x09, 0x20
    DIGIT  = "0123456789"
    ALPHA  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
             "abcdefghijklmnopqrstuvwxyz"
    ALNUM  = DIGIT + ALPHA
    PUNCT  = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    LOWER  = "abcdefghijklmnopqrstuvwxyz"
    UPPER  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e
    PRINT  = ' !"#$%&\'()*+,-./0123456789:;<=>?@' \
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`' \
             'abcdefghijklmnopqrstuvwxyz{|}~'
    # 0x21-0x7e
    GRAPH  = '!"#$%&\'()*+,-./0123456789:;<=>?@' \
             'ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`' \
             'abcdefghijklmnopqrstuvwxyz{|}~'
    XDIGIT = DIGIT + "ABCDEFabcdef"

    # --- double-byte classes (kept as raw byte escapes) ---

    JA_SPACE = "\xa1\xa1"
    JA_BLANK = "\xa1\xa1"
    JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
               "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
               "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
               "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
               "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
               "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
               "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
               "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
               "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
               "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
               "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
               "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
               "\xa3\xf9\xa3\xfa"
    JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
               "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
               "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
               "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
               "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
               "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
               "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
               "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
               "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
               "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
               "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
               "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
               "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
               "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
               "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
               "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
               "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
               "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
               "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
               "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
               "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
               "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
               "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
               "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
               "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
               "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
               "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
               "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
               "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
               "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
               "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
               "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
               "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
               "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
               "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
               "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
               "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
               "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
               "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
               "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
               "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
               "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
               "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
               "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
               "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
               "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
               "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
               "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
               "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
               "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
               "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
               "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
               "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
               "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
               "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
               "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
               "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
               "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
               "\xa8\xbf\xa8\xc0"
    HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
           "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
           "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
           "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
           "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
           "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
           "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
           "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
           "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
           "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
           "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
           "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
           "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
           "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
           "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
           "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
           "\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
    # add onbiki and kanagaeshi(hira)
    HIRA_EX = HIRA + "\xa1\xbc" + "\xa1\xb3\xa1\xb4"
    KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
           "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
           "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
           "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
           "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
           "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
           "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
           "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
           "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
           "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
           "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
           "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
           "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
           "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
           "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
           "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
           "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
           "\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
    # add onbiki and kanagaeshi(kata)
    KATA_EX = KATA + "\xa1\xbc" + "\xa1\xb5\xa1\xb6"
    # One "first-last" range per lead byte; rows 0xcf and 0xf4 end early.
    KANJI = "\xb0\xa1-\xb0\xfe" \
            "\xb1\xa1-\xb1\xfe" \
            "\xb2\xa1-\xb2\xfe" \
            "\xb3\xa1-\xb3\xfe" \
            "\xb4\xa1-\xb4\xfe" \
            "\xb5\xa1-\xb5\xfe" \
            "\xb6\xa1-\xb6\xfe" \
            "\xb7\xa1-\xb7\xfe" \
            "\xb8\xa1-\xb8\xfe" \
            "\xb9\xa1-\xb9\xfe" \
            "\xba\xa1-\xba\xfe" \
            "\xbb\xa1-\xbb\xfe" \
            "\xbc\xa1-\xbc\xfe" \
            "\xbd\xa1-\xbd\xfe" \
            "\xbe\xa1-\xbe\xfe" \
            "\xbf\xa1-\xbf\xfe" \
            "\xc0\xa1-\xc0\xfe" \
            "\xc1\xa1-\xc1\xfe" \
            "\xc2\xa1-\xc2\xfe" \
            "\xc3\xa1-\xc3\xfe" \
            "\xc4\xa1-\xc4\xfe" \
            "\xc5\xa1-\xc5\xfe" \
            "\xc6\xa1-\xc6\xfe" \
            "\xc7\xa1-\xc7\xfe" \
            "\xc8\xa1-\xc8\xfe" \
            "\xc9\xa1-\xc9\xfe" \
            "\xca\xa1-\xca\xfe" \
            "\xcb\xa1-\xcb\xfe" \
            "\xcc\xa1-\xcc\xfe" \
            "\xcd\xa1-\xcd\xfe" \
            "\xce\xa1-\xce\xfe" \
            "\xcf\xa1-\xcf\xd3" \
            "\xd0\xa1-\xd0\xfe" \
            "\xd1\xa1-\xd1\xfe" \
            "\xd2\xa1-\xd2\xfe" \
            "\xd3\xa1-\xd3\xfe" \
            "\xd4\xa1-\xd4\xfe" \
            "\xd5\xa1-\xd5\xfe" \
            "\xd6\xa1-\xd6\xfe" \
            "\xd7\xa1-\xd7\xfe" \
            "\xd8\xa1-\xd8\xfe" \
            "\xd9\xa1-\xd9\xfe" \
            "\xda\xa1-\xda\xfe" \
            "\xdb\xa1-\xdb\xfe" \
            "\xdc\xa1-\xdc\xfe" \
            "\xdd\xa1-\xdd\xfe" \
            "\xde\xa1-\xde\xfe" \
            "\xdf\xa1-\xdf\xfe" \
            "\xe0\xa1-\xe0\xfe" \
            "\xe1\xa1-\xe1\xfe" \
            "\xe2\xa1-\xe2\xfe" \
            "\xe3\xa1-\xe3\xfe" \
            "\xe4\xa1-\xe4\xfe" \
            "\xe5\xa1-\xe5\xfe" \
            "\xe6\xa1-\xe6\xfe" \
            "\xe7\xa1-\xe7\xfe" \
            "\xe8\xa1-\xe8\xfe" \
            "\xe9\xa1-\xe9\xfe" \
            "\xea\xa1-\xea\xfe" \
            "\xeb\xa1-\xeb\xfe" \
            "\xec\xa1-\xec\xfe" \
            "\xed\xa1-\xed\xfe" \
            "\xee\xa1-\xee\xfe" \
            "\xef\xa1-\xef\xfe" \
            "\xf0\xa1-\xf0\xfe" \
            "\xf1\xa1-\xf1\xfe" \
            "\xf2\xa1-\xf2\xfe" \
            "\xf3\xa1-\xf3\xfe" \
            "\xf4\xa1-\xf4\xa6"
    KANJI_EX = KANJI + "\xa1\xb9" # + noma
    JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
    JA_PRINT = JA_GRAPH + JA_BLANK

    # kludge to avoid warning "character class has `[' without escape":
    # the tables containing regexp metacharacters are escaped in place.
    [PUNCT, PRINT, GRAPH].each do |table|
      table.replace(Regexp.quote(table))
    end

    # Alternatives, tried in order: ASCII graphic run with at most one
    # trailing blank; whitespace run; kanji followed by hiragana; katakana
    # followed by hiragana; kanji run; katakana run; hiragana run; anything
    # else, lazily.
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "(?:[#{SPACE}]+)",
                       "(?:[#{KANJI_EX}]+[#{HIRA}]+)",
                       "(?:[#{KATA_EX}]+[#{HIRA}]+)",
                       "(?:[#{KANJI_EX}]+)",
                       "(?:[#{KATA_EX}]+)",
                       "(?:[#{HIRA_EX}]+)",
                       "(?:.+?)"].join("|")

    CharString.register_encoding(self)

  end # module EUCJP
end
|