docdiff 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. data/.gitignore +6 -0
  2. data/.travis.yml +7 -0
  3. data/Gemfile +17 -0
  4. data/Guardfile +8 -0
  5. data/Makefile +108 -0
  6. data/Rakefile +17 -0
  7. data/bin/docdiff +179 -0
  8. data/devutil/JIS0208.TXT +6952 -0
  9. data/devutil/char_by_charclass.rb +23 -0
  10. data/devutil/charclass_by_char.rb +21 -0
  11. data/devutil/jis0208.rb +343 -0
  12. data/devutil/testjis0208.rb +38 -0
  13. data/docdiff.conf.example +22 -0
  14. data/docdiff.gemspec +23 -0
  15. data/docdiffwebui.cgi +176 -0
  16. data/docdiffwebui.html +123 -0
  17. data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
  18. data/img/docdiff-screenshot-format-html-firefox.png +0 -0
  19. data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
  20. data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
  21. data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
  22. data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
  23. data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
  24. data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
  25. data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
  26. data/index.html +181 -0
  27. data/langfilter.rb +14 -0
  28. data/lib/doc_diff.rb +170 -0
  29. data/lib/docdiff.rb +7 -0
  30. data/lib/docdiff/charstring.rb +579 -0
  31. data/lib/docdiff/diff.rb +217 -0
  32. data/lib/docdiff/diff/contours.rb +382 -0
  33. data/lib/docdiff/diff/editscript.rb +148 -0
  34. data/lib/docdiff/diff/rcsdiff.rb +107 -0
  35. data/lib/docdiff/diff/shortestpath.rb +93 -0
  36. data/lib/docdiff/diff/speculative.rb +40 -0
  37. data/lib/docdiff/diff/subsequence.rb +39 -0
  38. data/lib/docdiff/diff/unidiff.rb +124 -0
  39. data/lib/docdiff/difference.rb +92 -0
  40. data/lib/docdiff/document.rb +127 -0
  41. data/lib/docdiff/encoding/en_ascii.rb +97 -0
  42. data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
  43. data/lib/docdiff/encoding/ja_sjis.rb +260 -0
  44. data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
  45. data/lib/docdiff/version.rb +3 -0
  46. data/lib/docdiff/view.rb +476 -0
  47. data/lib/viewdiff.rb +375 -0
  48. data/readme.html +713 -0
  49. data/sample/01.en.ascii.cr +1 -0
  50. data/sample/01.en.ascii.crlf +2 -0
  51. data/sample/01.en.ascii.lf +2 -0
  52. data/sample/01.ja.eucjp.lf +2 -0
  53. data/sample/01.ja.sjis.cr +1 -0
  54. data/sample/01.ja.sjis.crlf +2 -0
  55. data/sample/01.ja.utf8.crlf +2 -0
  56. data/sample/02.en.ascii.cr +1 -0
  57. data/sample/02.en.ascii.crlf +2 -0
  58. data/sample/02.en.ascii.lf +2 -0
  59. data/sample/02.ja.eucjp.lf +2 -0
  60. data/sample/02.ja.sjis.cr +1 -0
  61. data/sample/02.ja.sjis.crlf +2 -0
  62. data/sample/02.ja.utf8.crlf +2 -0
  63. data/sample/humpty_dumpty01.ascii.lf +4 -0
  64. data/sample/humpty_dumpty02.ascii.lf +4 -0
  65. data/test/charstring_test.rb +1008 -0
  66. data/test/diff_test.rb +36 -0
  67. data/test/difference_test.rb +64 -0
  68. data/test/docdiff_test.rb +193 -0
  69. data/test/document_test.rb +626 -0
  70. data/test/test_helper.rb +7 -0
  71. data/test/view_test.rb +570 -0
  72. data/test/viewdiff_test.rb +908 -0
  73. metadata +129 -0
@@ -0,0 +1,92 @@
1
+ # Difference class for DocDiff
2
+ # 2003-03-24 ..
3
+ # Hisashi MORITA
4
+
5
+ require 'docdiff/diff'
6
+
7
# Difference between two sequences, stored as an Array of edit blocks.
#
# Each block is a three-element Array whose first element is a tag:
#   [:common_elt_elt, elements, elements]  present in both sequences
#   [:del_elt,        deleted,  ...]       present only in the former
#   [:add_elt,        ...,      added]     present only in the latter
#   [:change_elt,     deleted,  added]     a deletion directly followed by
#                                          an addition, merged into one block
# The first three tags are read from Diff#ses; :change_elt is produced here
# by combine_del_add_to_change!.
class Difference < Array

  # @resolution = nil # char, word, phrase, sentence, line, paragraph..
  # @codeset = ''
  # @eol_char = "\n"
  # @source = 'source'
  # @target = 'target'
  # attr_accessor :resolution, :codeset, :eol_char, :source, :target

  # The unprocessed blocks, exactly as taken from the edit script.
  attr_accessor :raw_list

  # Builds the difference between array1 and array2.
  #
  # When both arguments are nil the result is an empty Difference whose
  # raw_list is an empty Array (it used to be left nil, which made
  # raw_list and combine_del_add_to_change! unusable on such an instance).
  def initialize(array1 = nil, array2 = nil)
    @raw_list = []
    return if array1.nil? && array2.nil?

    # Original author's note: Diff::EditScript does not have
    # each_with_index(), so the blocks are copied into a plain Array first.
    Diff.new(array1, array2).ses.each { |block| @raw_list << block }
    combine_del_add_to_change!
  end

  # Appends the blocks of raw_list to self, merging every deletion that is
  # immediately followed by an addition into a single :change_elt block.
  #
  # A deletion is held back until the next block shows whether it is part of
  # a change. Holding it in a variable (instead of peeking at the previous
  # raw block) also keeps a deletion that is followed by another deletion,
  # which the one-block look-behind used to drop.
  #
  # Raises RuntimeError when a block carries an unknown tag.
  # Returns raw_list, as the previous implementation did.
  def combine_del_add_to_change!
    pending_del = nil

    @raw_list.each_with_index do |block, i|
      case block.first
      when :common_elt_elt
        self << pending_del if pending_del
        pending_del = nil
        self << block
      when :del_elt
        self << pending_del if pending_del
        pending_del = block
      when :add_elt
        if pending_del
          self << [:change_elt, pending_del[1], block[2]]
          pending_del = nil
        else
          self << block
        end
      else
        raise "the first element of the block #{i} is invalid: (#{block.first})\n"
      end
    end

    # A trailing deletion has no successor to pair with; emit it as it is.
    self << pending_del if pending_del
    @raw_list
  end

  # Returns a copy that describes the former sequence only: additions are
  # removed and the added part of every change is replaced by nil.
  def former_only
    elms = dup.delete_if { |e| e[0] == :add_elt }
    elms.collect! { |e| e[0] == :change_elt ? [e[0], e[1], nil] : e }
    elms
  end

  # Returns a copy that describes the latter sequence only: deletions are
  # removed and the deleted part of every change is replaced by nil.
  def latter_only
    elms = dup.delete_if { |e| e[0] == :del_elt }
    elms.collect! { |e| e[0] == :change_elt ? [e[0], nil, e[2]] : e }
    elms
  end

end # class Difference
@@ -0,0 +1,127 @@
1
+ # Document class, a part of DocDiff
2
+ # 2004-01-14.. Hisashi MORITA
3
+
4
+ require 'docdiff/charstring'
5
+
6
# Raised when no encoding was specified for a document and automatic
# detection failed.
#
# Derives from StandardError (not Exception) so that an ordinary `rescue`
# clause can handle it; rescuing it by name works as before.
class EncodingDetectionFailure < StandardError
end
8
# Raised when no end-of-line convention was specified for a document and
# automatic detection failed.
#
# Derives from StandardError (not Exception) so that an ordinary `rescue`
# clause can handle it; rescuing it by name works as before.
class EOLDetectionFailure < StandardError
end
10
+
11
# A text document as seen by DocDiff.
#
# Wraps a String that has been extended with CharString, makes sure the
# string knows its encoding and end-of-line convention, and forwards every
# splitting and counting query to it.
class Document

  # str:: the text. It is extended with CharString in place, i.e. the
  #       object passed in is modified rather than copied.
  # enc:: encoding to set on the text. When omitted and the text does not
  #       report an encoding, CharString.guess_encoding is consulted.
  # e::   end-of-line convention to set on the text. When omitted,
  #       CharString.guess_eol is consulted.
  #
  # Raises EncodingDetectionFailure or EOLDetectionFailure when the
  # respective guess comes back as "UNKNOWN".
  def initialize(str, enc = nil, e = nil)
    @body = str
    @body.extend CharString

    if enc
      @body.encoding = enc
    elsif !@body.encoding
      @body.encoding = detect_encoding(str)
    end

    @body.eol = e || detect_eol(str)
  end

  # --- encoding and end-of-line of the wrapped text ---

  def encoding
    @body.encoding
  end

  def encoding=(cs)
    @body.encoding = cs
  end

  def eol
    @body.eol
  end

  def eol=(eolstr)
    @body.eol = eolstr
  end

  # --- splitting, forwarded to the wrapped text ---

  def split_to_line
    @body.split_to_line
  end

  def split_to_word
    @body.split_to_word
  end

  def split_to_char
    @body.split_to_char
  end

  def split_to_byte
    @body.split_to_byte
  end

  # --- line counts ---

  def count_line
    @body.count_line
  end

  def count_blank_line
    @body.count_blank_line
  end

  def count_empty_line
    @body.count_empty_line
  end

  def count_graph_line
    @body.count_graph_line
  end

  # --- word counts ---

  def count_word
    @body.count_word
  end

  def count_latin_word
    @body.count_latin_word
  end

  def count_ja_word
    @body.count_ja_word
  end

  def count_valid_word
    @body.count_valid_word
  end

  def count_latin_valid_word
    @body.count_latin_valid_word
  end

  def count_ja_valid_word
    @body.count_ja_valid_word
  end

  # --- character counts ---

  def count_char
    @body.count_char
  end

  def count_blank_char
    @body.count_blank_char
  end

  def count_graph_char
    @body.count_graph_char
  end

  def count_latin_blank_char
    @body.count_latin_blank_char
  end

  def count_latin_graph_char
    @body.count_latin_graph_char
  end

  def count_ja_blank_char
    @body.count_ja_blank_char
  end

  def count_ja_graph_char
    @body.count_ja_graph_char
  end

  # --- byte count and end-of-line character ---

  def count_byte
    @body.count_byte
  end

  def eol_char
    @body.eol_char
  end

  private

  # Returns the guessed encoding of str, or raises when the guess is
  # "UNKNOWN". Falling back to ASCII here was considered and rejected by the
  # original author ("default to ASCII <= BAD!").
  def detect_encoding(str)
    guess = CharString.guess_encoding(str)
    if guess == "UNKNOWN"
      raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
    end
    guess
  end

  # Returns the guessed end-of-line convention of str, or raises when the
  # guess is "UNKNOWN" (no silent default to LF).
  def detect_eol(str)
    guess = CharString.guess_eol(str)
    if guess == "UNKNOWN"
      raise EOLDetectionFailure, "eol not specified, and auto detection failed."
    end
    guess
  end

end # class Document
@@ -0,0 +1,97 @@
1
+ # English ASCII encoding module for CharString
2
+ # 2003- Hisashi MORITA
3
+
4
module CharString
  # US-ASCII support for CharString.
  #
  # Defines one String constant per character class (each string simply
  # lists the member characters so it can be dropped into a regexp bracket
  # expression), the word-splitting pattern source, and overrides that
  # report zero for every Japanese-specific count.
  module ASCII

    Encoding = "US-ASCII"

    # 0x00-0x1f and 0x7f
    CNTRL = "\x00\x01\x02\x03\x04\x05\x06\x07" \
            "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" \
            "\x10\x11\x12\x13\x14\x15\x16\x17" \
            "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
            "\x7f"
    SPACE = "\t\n\v\f\r "   # 0x09-0x0d and 0x20
    BLANK = "\t "           # 0x09 and 0x20
    DIGIT = "0123456789"
    ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
            "abcdefghijklmnopqrstuvwxyz"
    ALNUM = DIGIT + ALPHA
    PUNCT = '!"#$%&\'()*+,-./' \
            ':;<=>?@' \
            '[\\]^_`' \
            '{|}~'
    LOWER = "abcdefghijklmnopqrstuvwxyz"
    UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e
    PRINT = ' !"#$%&\'()*+,-./' \
            '0123456789' \
            ':;<=>?@' \
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ' \
            '[\\]^_`' \
            'abcdefghijklmnopqrstuvwxyz' \
            '{|}~'
    # 0x21-0x7e
    GRAPH = '!"#$%&\'()*+,-./' \
            '0123456789' \
            ':;<=>?@' \
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ' \
            '[\\]^_`' \
            'abcdefghijklmnopqrstuvwxyz' \
            '{|}~'
    XDIGIT = "0123456789" \
             "ABCDEF" \
             "abcdef"

    JA_BLANK = "" # kludge...
    JA_GRAPH = "" # kludge...

    # kludge to avoid warning "character class has `[' without escape":
    # the classes that contain regexp metacharacters are escaped in place.
    [PUNCT, PRINT, GRAPH].each do |char_class|
      char_class.replace(Regexp.quote(char_class))
    end

    # Alternatives, tried in this order: a run of graphic characters with an
    # optional trailing blank, a run of white space, or anything else.
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "|(?:[#{SPACE}]+)",
                       "|(?:.+?)"].join

    # The four methods below override the default methods, as ASCII has no
    # Japanese in it.

    def count_ja_graph_char
      0
    end

    def count_ja_blank_char
      0
    end

    def count_ja_word
      0
    end

    def count_ja_valid_word
      0
    end

    CharString.register_encoding(self)

  end # module ASCII
end
@@ -0,0 +1,269 @@
1
+ # Japanese EUC-JP encoding module for CharString
2
+ # 2003- Hisashi MORITA
3
+
4
module CharString
  # Japanese EUC-JP support for CharString.
  #
  # Defines one String constant per character class (each string lists the
  # member characters, or byte ranges in the case of KANJI, so it can be
  # dropped into a regexp bracket expression) and the word-splitting pattern
  # source built from them.
  module EUC_JP

    Encoding = "EUC-JP"

    # character table based on:
    # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT

    # ---- single-byte (ASCII) classes ----

    # 0x00-0x1f and 0x7f
    CNTRL = "\x00\x01\x02\x03\x04\x05\x06\x07" \
            "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" \
            "\x10\x11\x12\x13\x14\x15\x16\x17" \
            "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" \
            "\x7f"
    SPACE = "\t\n\v\f\r "   # 0x09-0x0d and 0x20
    BLANK = "\t "           # 0x09 and 0x20
    DIGIT = "0123456789"
    ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
            "abcdefghijklmnopqrstuvwxyz"
    ALNUM = DIGIT + ALPHA
    PUNCT = '!"#$%&\'()*+,-./' \
            ':;<=>?@' \
            '[\\]^_`' \
            '{|}~'
    LOWER = "abcdefghijklmnopqrstuvwxyz"
    UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # 0x20-0x7e
    PRINT = ' !"#$%&\'()*+,-./' \
            '0123456789' \
            ':;<=>?@' \
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ' \
            '[\\]^_`' \
            'abcdefghijklmnopqrstuvwxyz' \
            '{|}~'
    # 0x21-0x7e
    GRAPH = '!"#$%&\'()*+,-./' \
            '0123456789' \
            ':;<=>?@' \
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ' \
            '[\\]^_`' \
            'abcdefghijklmnopqrstuvwxyz' \
            '{|}~'
    XDIGIT = "0123456789" \
             "ABCDEF" \
             "abcdef"

    # ---- double-byte classes ----

    JA_SPACE = "\xa1\xa1"
    JA_BLANK = "\xa1\xa1"
    # lead byte 0xa3
    JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
               "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
               "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
               "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
               "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
               "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
               "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
               "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
               "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
               "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
               "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
               "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
               "\xa3\xf9\xa3\xfa"
    # lead bytes 0xa1, 0xa2, 0xa6, 0xa7 and 0xa8
    JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
               "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
               "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
               "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
               "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
               "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
               "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
               "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
               "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
               "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
               "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
               "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
               "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
               "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
               "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
               "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
               "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
               "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
               "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
               "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
               "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
               "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
               "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
               "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
               "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
               "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
               "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
               "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
               "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
               "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
               "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
               "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
               "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
               "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
               "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
               "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
               "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
               "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
               "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
               "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
               "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
               "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
               "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
               "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
               "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
               "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
               "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
               "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
               "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
               "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
               "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
               "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
               "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
               "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
               "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
               "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
               "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
               "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
               "\xa8\xbf\xa8\xc0"
    # [\xa4\xa1-\xa4\xf3]
    HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
           "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
           "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
           "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
           "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
           "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
           "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
           "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
           "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
           "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
           "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
           "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
           "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
           "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
           "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
           "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
           "\xa4\xf1\xa4\xf2\xa4\xf3"
    # add onbiki and kanagaeshi(hira)
    HIRA_EX = HIRA +
              "\xa1\xbc" +
              "\xa1\xb3\xa1\xb4"
    # [\xa5\xa1-\xa5\xf6]
    KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
           "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
           "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
           "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
           "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
           "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
           "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
           "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
           "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
           "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
           "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
           "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
           "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
           "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
           "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
           "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
           "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
           "\xa5\xf6"
    # add onbiki and kanagaeshi(kata)
    KATA_EX = KATA +
              "\xa1\xbc" +
              "\xa1\xb5\xa1\xb6"
    # One "first-last" range per lead byte 0xb0-0xf4; the 0xcf and 0xf4 rows
    # stop early (at 0xd3 and 0xa6), every other row runs 0xa1-0xfe.
    KANJI = "\xb0\xa1-\xb0\xfe" \
            "\xb1\xa1-\xb1\xfe" \
            "\xb2\xa1-\xb2\xfe" \
            "\xb3\xa1-\xb3\xfe" \
            "\xb4\xa1-\xb4\xfe" \
            "\xb5\xa1-\xb5\xfe" \
            "\xb6\xa1-\xb6\xfe" \
            "\xb7\xa1-\xb7\xfe" \
            "\xb8\xa1-\xb8\xfe" \
            "\xb9\xa1-\xb9\xfe" \
            "\xba\xa1-\xba\xfe" \
            "\xbb\xa1-\xbb\xfe" \
            "\xbc\xa1-\xbc\xfe" \
            "\xbd\xa1-\xbd\xfe" \
            "\xbe\xa1-\xbe\xfe" \
            "\xbf\xa1-\xbf\xfe" \
            "\xc0\xa1-\xc0\xfe" \
            "\xc1\xa1-\xc1\xfe" \
            "\xc2\xa1-\xc2\xfe" \
            "\xc3\xa1-\xc3\xfe" \
            "\xc4\xa1-\xc4\xfe" \
            "\xc5\xa1-\xc5\xfe" \
            "\xc6\xa1-\xc6\xfe" \
            "\xc7\xa1-\xc7\xfe" \
            "\xc8\xa1-\xc8\xfe" \
            "\xc9\xa1-\xc9\xfe" \
            "\xca\xa1-\xca\xfe" \
            "\xcb\xa1-\xcb\xfe" \
            "\xcc\xa1-\xcc\xfe" \
            "\xcd\xa1-\xcd\xfe" \
            "\xce\xa1-\xce\xfe" \
            "\xcf\xa1-\xcf\xd3" \
            "\xd0\xa1-\xd0\xfe" \
            "\xd1\xa1-\xd1\xfe" \
            "\xd2\xa1-\xd2\xfe" \
            "\xd3\xa1-\xd3\xfe" \
            "\xd4\xa1-\xd4\xfe" \
            "\xd5\xa1-\xd5\xfe" \
            "\xd6\xa1-\xd6\xfe" \
            "\xd7\xa1-\xd7\xfe" \
            "\xd8\xa1-\xd8\xfe" \
            "\xd9\xa1-\xd9\xfe" \
            "\xda\xa1-\xda\xfe" \
            "\xdb\xa1-\xdb\xfe" \
            "\xdc\xa1-\xdc\xfe" \
            "\xdd\xa1-\xdd\xfe" \
            "\xde\xa1-\xde\xfe" \
            "\xdf\xa1-\xdf\xfe" \
            "\xe0\xa1-\xe0\xfe" \
            "\xe1\xa1-\xe1\xfe" \
            "\xe2\xa1-\xe2\xfe" \
            "\xe3\xa1-\xe3\xfe" \
            "\xe4\xa1-\xe4\xfe" \
            "\xe5\xa1-\xe5\xfe" \
            "\xe6\xa1-\xe6\xfe" \
            "\xe7\xa1-\xe7\xfe" \
            "\xe8\xa1-\xe8\xfe" \
            "\xe9\xa1-\xe9\xfe" \
            "\xea\xa1-\xea\xfe" \
            "\xeb\xa1-\xeb\xfe" \
            "\xec\xa1-\xec\xfe" \
            "\xed\xa1-\xed\xfe" \
            "\xee\xa1-\xee\xfe" \
            "\xef\xa1-\xef\xfe" \
            "\xf0\xa1-\xf0\xfe" \
            "\xf1\xa1-\xf1\xfe" \
            "\xf2\xa1-\xf2\xfe" \
            "\xf3\xa1-\xf3\xfe" \
            "\xf4\xa1-\xf4\xa6"
    KANJI_EX = KANJI + "\xa1\xb9" # + noma
    JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
    JA_PRINT = JA_GRAPH + JA_BLANK

    # kludge to avoid warning "character class has `[' without escape":
    # the classes that contain regexp metacharacters are escaped in place.
    [PUNCT, PRINT, GRAPH].each do |char_class|
      char_class.replace(Regexp.quote(char_class))
    end

    # Alternatives, tried in this order: ASCII graphic run with an optional
    # trailing blank; white space; kanji followed by hiragana; katakana
    # followed by hiragana; kanji alone; katakana alone; hiragana alone;
    # anything else.
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "|(?:[#{SPACE}]+)",
                       "|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
                       "|(?:[#{KATA_EX}]+[#{HIRA}]+)",
                       "|(?:[#{KANJI_EX}]+)",
                       "|(?:[#{KATA_EX}]+)",
                       "|(?:[#{HIRA_EX}]+)",
                       "|(?:.+?)"].join

    CharString.register_encoding(self)

  end # module EUC_JP
end