docdiff 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
@@ -0,0 +1,92 @@
|
|
1
|
+
# Difference class for DocDiff
|
2
|
+
# 2003-03-24 ..
|
3
|
+
# Hisashi MORITA
|
4
|
+
|
5
|
+
require 'docdiff/diff'
|
6
|
+
|
7
|
+
# Ordered list of edit blocks describing how one sequence differs from another.
#
# Every element is an array whose first item is a tag:
#   :common_elt_elt - passed through unchanged from the raw edit script
#   :del_elt        - block[1] holds what was removed
#   :add_elt        - block[2] holds what was added
#   :change_elt     - [:change_elt, deleted, added]; built here from a
#                     :del_elt that is immediately followed by an :add_elt
#
# The unmerged blocks, exactly as Diff#ses yielded them, stay available
# through #raw_list.
class Difference < Array

  # Attributes sketched in the original notes but never implemented:
  # resolution (char, word, phrase, sentence, line, paragraph..), codeset,
  # eol_char, source, target.

  # The raw edit-script blocks, before del/add pairs are merged.
  attr_accessor :raw_list

  # Builds the difference between array1 and array2.
  #
  # With no arguments (both nil) the result is an empty Difference whose
  # raw_list is an empty array, so it can be filled in and combined later.
  # Otherwise the edit script comes from Diff.new(array1, array2).ses and is
  # merged straight away.
  def initialize(array1 = nil, array2 = nil)
    # Always start from an empty list so that raw_list and
    # combine_del_add_to_change! are usable on an empty instance.
    @raw_list = []
    return if array1.nil? && array2.nil?

    # Diff::EditScript does not have each_with_index(), so copy the blocks
    # into a plain array first.
    Diff.new(array1, array2).ses.each { |block| @raw_list << block }
    combine_del_add_to_change!
  end

  # Appends the blocks of raw_list to self, folding each :del_elt that is
  # directly followed by an :add_elt into a single :change_elt.
  #
  # Every other block is appended as is, in order. In particular a :del_elt
  # that is not followed by an :add_elt is always kept, whatever follows it.
  #
  # Raises RuntimeError when a block carries an unknown tag.
  def combine_del_add_to_change!
    @raw_list.each_with_index do |block, i|
      case block.first
      when :common_elt_elt
        self << block
      when :del_elt
        # Hold the deletion back only when the very next block is an
        # addition; that addition emits the combined :change_elt.
        following = @raw_list[i + 1]
        self << block unless following && following.first == :add_elt
      when :add_elt
        # Guard i > 0 so that index -1 never wraps around to the last block.
        preceding = i > 0 ? @raw_list[i - 1] : nil
        if preceding && preceding.first == :del_elt
          self << [:change_elt, preceding[1], block[2]]
        else
          self << block
        end
      else
        raise "the first element of the block #{i} is invalid: (#{block.first})\n"
      end
    end
  end

  # Returns a copy restricted to the former (first) sequence: :add_elt blocks
  # are removed and the added side of every :change_elt is replaced by nil.
  # The receiver is left untouched.
  def former_only
    kept = dup.delete_if { |e| e[0] == :add_elt }
    kept.collect! { |e| e[0] == :change_elt ? [e[0], e[1], nil] : e }
  end

  # Returns a copy restricted to the latter (second) sequence: :del_elt blocks
  # are removed and the deleted side of every :change_elt is replaced by nil.
  # The receiver is left untouched.
  def latter_only
    kept = dup.delete_if { |e| e[0] == :del_elt }
    kept.collect! { |e| e[0] == :change_elt ? [e[0], nil, e[2]] : e }
  end

end # class Difference
|
@@ -0,0 +1,127 @@
|
|
1
|
+
# Document class, a part of DocDiff
|
2
|
+
# 2004-01-14.. Hisashi MORITA
|
3
|
+
|
4
|
+
require 'docdiff/charstring'
|
5
|
+
|
6
|
+
# Raised by Document#initialize when no encoding was given and guessing failed.
# NOTE(review): this derives from Exception, not StandardError, so a bare
# `rescue` will not catch it -- confirm that is intended before changing it.
class EncodingDetectionFailure < Exception; end
|
8
|
+
# Raised by Document#initialize when no end-of-line style was given and
# guessing failed.
# NOTE(review): this derives from Exception, not StandardError, so a bare
# `rescue` will not catch it -- confirm that is intended before changing it.
class EOLDetectionFailure < Exception; end
|
10
|
+
|
11
|
+
# A text body together with its encoding and end-of-line style.
#
# The string handed to the constructor is itself extended with CharString
# (no copy is made); every query below is answered by that extended string.
class Document

  # str - the text; it is extended in place with CharString.
  # enc - encoding name; when omitted and the body reports no encoding,
  #       CharString.guess_encoding is consulted.
  # e   - end-of-line name; when omitted, CharString.guess_eol is consulted.
  #
  # Raises EncodingDetectionFailure or EOLDetectionFailure when the
  # corresponding guess comes back as "UNKNOWN". No default is substituted
  # in either case.
  def initialize(str, enc = nil, e = nil)
    @body = str
    @body.extend CharString

    if enc
      @body.encoding = enc
    elsif !@body.encoding
      encoding_guess = CharString.guess_encoding(str)
      if encoding_guess == "UNKNOWN"
        raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
      end
      @body.encoding = encoding_guess
    end

    if e
      @body.eol = e
    else
      eol_guess = CharString.guess_eol(str)
      if eol_guess == "UNKNOWN"
        raise EOLDetectionFailure, "eol not specified, and auto detection failed."
      end
      @body.eol = eol_guess
    end
  end

  # Writers forward the new value to the body.
  def encoding=(cs)
    @body.encoding = cs
  end

  def eol=(eolstr)
    @body.eol = eolstr
  end

  # Argument-less readers: each one simply returns whatever the body's
  # method of the same name returns.
  [
    :encoding, :eol,
    :split_to_line, :split_to_word, :split_to_char, :split_to_byte,
    :count_line, :count_blank_line, :count_empty_line, :count_graph_line,
    :count_word, :count_latin_word, :count_ja_word,
    :count_valid_word, :count_latin_valid_word, :count_ja_valid_word,
    :count_char, :count_blank_char, :count_graph_char,
    :count_latin_blank_char, :count_latin_graph_char,
    :count_ja_blank_char, :count_ja_graph_char,
    :count_byte,
    :eol_char
  ].each do |query|
    define_method(query) { @body.public_send(query) }
  end

end # class Document
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# English ASCII encoding module for CharString
|
2
|
+
# 2003- Hisashi MORITA
|
3
|
+
|
4
|
+
# US-ASCII character-class tables for CharString.
#
# Each class constant is a string that lists its member characters; the
# strings are interpolated into regexp character classes (see WORD_REGEXP_SRC).
module CharString
  module ASCII

    Encoding = "US-ASCII"

    # 0x00-0x1f plus DEL
    CNTRL  = "\x00\x01\x02\x03\x04\x05\x06\x07" \
             "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" \
             "\x10\x11\x12\x13\x14\x15\x16\x17" \
             "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
             "\x7f"
    SPACE  = "\x09\x0a\x0b\x0c\x0d\x20"
    BLANK  = "\x09\x20"
    DIGIT  = "0123456789"
    ALPHA  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
             "abcdefghijklmnopqrstuvwxyz"
    ALNUM  = DIGIT + ALPHA
    # the four runs of punctuation that sit between digits and letters
    PUNCT  = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" \
             "\x3a\x3b\x3c\x3d\x3e\x3f\x40" \
             "\x5b\x5c\x5d\x5e\x5f\x60" \
             "\x7b\x7c\x7d\x7e"
    LOWER  = "abcdefghijklmnopqrstuvwxyz"
    UPPER  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e, one row per 16 code points
    PRINT  = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" \
             "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" \
             "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" \
             "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" \
             "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
             "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e"
    # PRINT without its leading space, i.e. 0x21-0x7e
    GRAPH  = PRINT[1..-1]
    XDIGIT = DIGIT + "ABCDEF" + "abcdef"

    # kludge: ASCII has no Japanese characters, so these are left empty
    JA_BLANK = ""
    JA_GRAPH = ""

    # kludge to avoid warning "character class has `[' without escape":
    # the three tables that contain regexp metacharacters are replaced, in
    # place, by their Regexp.quote'd form.
    [PUNCT, PRINT, GRAPH].each { |table| table.replace(Regexp.quote(table)) }

    # Alternatives, tried in this order: a run of graphic characters with
    # at most one trailing blank, a run of whitespace, or anything else
    # matched as lazily as possible.
    WORD_REGEXP_SRC = [
      "(?:[#{GRAPH}]+[#{BLANK}]?)",
      "(?:[#{SPACE}]+)",
      "(?:.+?)"
    ].join("|")

    # The four counters below override the default methods, as ASCII has no
    # Japanese in it: each one always answers zero.

    def count_ja_graph_char
      0
    end

    def count_ja_blank_char
      0
    end

    def count_ja_word
      0
    end

    def count_ja_valid_word
      0
    end

    CharString.register_encoding(self)

  end # module ASCII
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
# Japanese EUC-JP encoding module for CharString
|
2
|
+
# 2003- Hisashi MORITA
|
3
|
+
|
4
|
+
# Japanese EUC-JP character-class tables for CharString.
#
# Each class constant is a string that lists its member characters (KANJI
# lists ranges instead); the strings are interpolated into regexp character
# classes (see WORD_REGEXP_SRC).
module CharString
  module EUC_JP

    Encoding = "EUC-JP"

    # character table based on:
    # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT

    # 0x00-0x1f plus DEL
    CNTRL  = "\x00\x01\x02\x03\x04\x05\x06\x07" \
             "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" \
             "\x10\x11\x12\x13\x14\x15\x16\x17" \
             "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
             "\x7f"
    SPACE  = "\x09\x0a\x0b\x0c\x0d\x20"
    BLANK  = "\x09\x20"
    DIGIT  = "0123456789"
    ALPHA  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
             "abcdefghijklmnopqrstuvwxyz"
    ALNUM  = DIGIT + ALPHA
    # the four runs of punctuation that sit between digits and letters
    PUNCT  = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" \
             "\x3a\x3b\x3c\x3d\x3e\x3f\x40" \
             "\x5b\x5c\x5d\x5e\x5f\x60" \
             "\x7b\x7c\x7d\x7e"
    LOWER  = "abcdefghijklmnopqrstuvwxyz"
    UPPER  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e, one row per 16 code points
    PRINT  = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" \
             "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" \
             "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" \
             "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" \
             "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
             "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e"
    # PRINT without its leading space, i.e. 0x21-0x7e
    GRAPH  = PRINT[1..-1]
    XDIGIT = DIGIT + "ABCDEF" + "abcdef"

    # Two-byte tables follow; every character is written as its two bytes.
    JA_SPACE = "\xa1\xa1"
    JA_BLANK = "\xa1\xa1"
    JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
      "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
      "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
      "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
      "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
      "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
      "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
      "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
      "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
      "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
      "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
      "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
      "\xa3\xf9\xa3\xfa"
    JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
      "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
      "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
      "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
      "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
      "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
      "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
      "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
      "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
      "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
      "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
      "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
      "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
      "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
      "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
      "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
      "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
      "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
      "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
      "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
      "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
      "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
      "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
      "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
      "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
      "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
      "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
      "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
      "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
      "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
      "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
      "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
      "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
      "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
      "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
      "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
      "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
      "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
      "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
      "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
      "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
      "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
      "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
      "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
      "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
      "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
      "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
      "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
      "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
      "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
      "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
      "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
      "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
      "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
      "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
      "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
      "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
      "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
      "\xa8\xbf\xa8\xc0"
    # every two-byte character from \xa4\xa1 through \xa4\xf3
    HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
      "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
      "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
      "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
      "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
      "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
      "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
      "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
      "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
      "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
      "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
      "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
      "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
      "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
      "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
      "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
      "\xa4\xf1\xa4\xf2\xa4\xf3"
    # hiragana plus onbiki and kanagaeshi(hira)
    HIRA_EX = HIRA + "\xa1\xbc" + "\xa1\xb3\xa1\xb4"
    # every two-byte character from \xa5\xa1 through \xa5\xf6
    KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
      "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
      "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
      "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
      "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
      "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
      "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
      "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
      "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
      "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
      "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
      "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
      "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
      "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
      "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
      "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
      "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
      "\xa5\xf6"
    # katakana plus onbiki and kanagaeshi(kata)
    KATA_EX = KATA + "\xa1\xbc" + "\xa1\xb5\xa1\xb6"
    # Unlike the tables above this one holds "first-last" ranges, one per
    # lead byte, meant to be read inside a regexp character class.
    KANJI = "\xb0\xa1-\xb0\xfe" \
      "\xb1\xa1-\xb1\xfe" \
      "\xb2\xa1-\xb2\xfe" \
      "\xb3\xa1-\xb3\xfe" \
      "\xb4\xa1-\xb4\xfe" \
      "\xb5\xa1-\xb5\xfe" \
      "\xb6\xa1-\xb6\xfe" \
      "\xb7\xa1-\xb7\xfe" \
      "\xb8\xa1-\xb8\xfe" \
      "\xb9\xa1-\xb9\xfe" \
      "\xba\xa1-\xba\xfe" \
      "\xbb\xa1-\xbb\xfe" \
      "\xbc\xa1-\xbc\xfe" \
      "\xbd\xa1-\xbd\xfe" \
      "\xbe\xa1-\xbe\xfe" \
      "\xbf\xa1-\xbf\xfe" \
      "\xc0\xa1-\xc0\xfe" \
      "\xc1\xa1-\xc1\xfe" \
      "\xc2\xa1-\xc2\xfe" \
      "\xc3\xa1-\xc3\xfe" \
      "\xc4\xa1-\xc4\xfe" \
      "\xc5\xa1-\xc5\xfe" \
      "\xc6\xa1-\xc6\xfe" \
      "\xc7\xa1-\xc7\xfe" \
      "\xc8\xa1-\xc8\xfe" \
      "\xc9\xa1-\xc9\xfe" \
      "\xca\xa1-\xca\xfe" \
      "\xcb\xa1-\xcb\xfe" \
      "\xcc\xa1-\xcc\xfe" \
      "\xcd\xa1-\xcd\xfe" \
      "\xce\xa1-\xce\xfe" \
      "\xcf\xa1-\xcf\xd3" \
      "\xd0\xa1-\xd0\xfe" \
      "\xd1\xa1-\xd1\xfe" \
      "\xd2\xa1-\xd2\xfe" \
      "\xd3\xa1-\xd3\xfe" \
      "\xd4\xa1-\xd4\xfe" \
      "\xd5\xa1-\xd5\xfe" \
      "\xd6\xa1-\xd6\xfe" \
      "\xd7\xa1-\xd7\xfe" \
      "\xd8\xa1-\xd8\xfe" \
      "\xd9\xa1-\xd9\xfe" \
      "\xda\xa1-\xda\xfe" \
      "\xdb\xa1-\xdb\xfe" \
      "\xdc\xa1-\xdc\xfe" \
      "\xdd\xa1-\xdd\xfe" \
      "\xde\xa1-\xde\xfe" \
      "\xdf\xa1-\xdf\xfe" \
      "\xe0\xa1-\xe0\xfe" \
      "\xe1\xa1-\xe1\xfe" \
      "\xe2\xa1-\xe2\xfe" \
      "\xe3\xa1-\xe3\xfe" \
      "\xe4\xa1-\xe4\xfe" \
      "\xe5\xa1-\xe5\xfe" \
      "\xe6\xa1-\xe6\xfe" \
      "\xe7\xa1-\xe7\xfe" \
      "\xe8\xa1-\xe8\xfe" \
      "\xe9\xa1-\xe9\xfe" \
      "\xea\xa1-\xea\xfe" \
      "\xeb\xa1-\xeb\xfe" \
      "\xec\xa1-\xec\xfe" \
      "\xed\xa1-\xed\xfe" \
      "\xee\xa1-\xee\xfe" \
      "\xef\xa1-\xef\xfe" \
      "\xf0\xa1-\xf0\xfe" \
      "\xf1\xa1-\xf1\xfe" \
      "\xf2\xa1-\xf2\xfe" \
      "\xf3\xa1-\xf3\xfe" \
      "\xf4\xa1-\xf4\xa6"
    # kanji plus noma
    KANJI_EX = KANJI + "\xa1\xb9"
    JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
    JA_PRINT = JA_GRAPH + JA_BLANK

    # kludge to avoid warning "character class has `[' without escape":
    # the three tables that contain regexp metacharacters are replaced, in
    # place, by their Regexp.quote'd form.
    [PUNCT, PRINT, GRAPH].each { |table| table.replace(Regexp.quote(table)) }

    # Alternatives, tried in this order: graphic ASCII run with at most one
    # trailing blank; whitespace run; kanji followed by hiragana; katakana
    # followed by hiragana; kanji run; katakana run; hiragana run; anything
    # else, matched as lazily as possible.
    WORD_REGEXP_SRC = [
      "(?:[#{GRAPH}]+[#{BLANK}]?)",
      "(?:[#{SPACE}]+)",
      "(?:[#{KANJI_EX}]+[#{HIRA}]+)",
      "(?:[#{KATA_EX}]+[#{HIRA}]+)",
      "(?:[#{KANJI_EX}]+)",
      "(?:[#{KATA_EX}]+)",
      "(?:[#{HIRA_EX}]+)",
      "(?:.+?)"
    ].join("|")

    CharString.register_encoding(self)

  end # module EUC_JP
end
|