docdiff 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data/.gitignore +6 -0
  2. data/.travis.yml +7 -0
  3. data/Gemfile +17 -0
  4. data/Guardfile +8 -0
  5. data/Makefile +108 -0
  6. data/Rakefile +17 -0
  7. data/bin/docdiff +179 -0
  8. data/devutil/JIS0208.TXT +6952 -0
  9. data/devutil/char_by_charclass.rb +23 -0
  10. data/devutil/charclass_by_char.rb +21 -0
  11. data/devutil/jis0208.rb +343 -0
  12. data/devutil/testjis0208.rb +38 -0
  13. data/docdiff.conf.example +22 -0
  14. data/docdiff.gemspec +23 -0
  15. data/docdiffwebui.cgi +176 -0
  16. data/docdiffwebui.html +123 -0
  17. data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
  18. data/img/docdiff-screenshot-format-html-firefox.png +0 -0
  19. data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
  20. data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
  21. data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
  22. data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
  23. data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
  24. data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
  25. data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
  26. data/index.html +181 -0
  27. data/langfilter.rb +14 -0
  28. data/lib/doc_diff.rb +170 -0
  29. data/lib/docdiff.rb +7 -0
  30. data/lib/docdiff/charstring.rb +579 -0
  31. data/lib/docdiff/diff.rb +217 -0
  32. data/lib/docdiff/diff/contours.rb +382 -0
  33. data/lib/docdiff/diff/editscript.rb +148 -0
  34. data/lib/docdiff/diff/rcsdiff.rb +107 -0
  35. data/lib/docdiff/diff/shortestpath.rb +93 -0
  36. data/lib/docdiff/diff/speculative.rb +40 -0
  37. data/lib/docdiff/diff/subsequence.rb +39 -0
  38. data/lib/docdiff/diff/unidiff.rb +124 -0
  39. data/lib/docdiff/difference.rb +92 -0
  40. data/lib/docdiff/document.rb +127 -0
  41. data/lib/docdiff/encoding/en_ascii.rb +97 -0
  42. data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
  43. data/lib/docdiff/encoding/ja_sjis.rb +260 -0
  44. data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
  45. data/lib/docdiff/version.rb +3 -0
  46. data/lib/docdiff/view.rb +476 -0
  47. data/lib/viewdiff.rb +375 -0
  48. data/readme.html +713 -0
  49. data/sample/01.en.ascii.cr +1 -0
  50. data/sample/01.en.ascii.crlf +2 -0
  51. data/sample/01.en.ascii.lf +2 -0
  52. data/sample/01.ja.eucjp.lf +2 -0
  53. data/sample/01.ja.sjis.cr +1 -0
  54. data/sample/01.ja.sjis.crlf +2 -0
  55. data/sample/01.ja.utf8.crlf +2 -0
  56. data/sample/02.en.ascii.cr +1 -0
  57. data/sample/02.en.ascii.crlf +2 -0
  58. data/sample/02.en.ascii.lf +2 -0
  59. data/sample/02.ja.eucjp.lf +2 -0
  60. data/sample/02.ja.sjis.cr +1 -0
  61. data/sample/02.ja.sjis.crlf +2 -0
  62. data/sample/02.ja.utf8.crlf +2 -0
  63. data/sample/humpty_dumpty01.ascii.lf +4 -0
  64. data/sample/humpty_dumpty02.ascii.lf +4 -0
  65. data/test/charstring_test.rb +1008 -0
  66. data/test/diff_test.rb +36 -0
  67. data/test/difference_test.rb +64 -0
  68. data/test/docdiff_test.rb +193 -0
  69. data/test/document_test.rb +626 -0
  70. data/test/test_helper.rb +7 -0
  71. data/test/view_test.rb +570 -0
  72. data/test/viewdiff_test.rb +908 -0
  73. metadata +129 -0
@@ -0,0 +1,92 @@
1
+ # Difference class for DocDiff
2
+ # 2003-03-24 ..
3
+ # Hisashi MORITA
4
+
5
+ require 'docdiff/diff'
6
+
7
# A list of edit blocks describing how one sequence turns into another.
#
# The object is itself the Array of processed blocks.  Every block is an
# Array whose first element is a tag:
#   [:common_elt_elt, ...]          copied through unchanged from Diff#ses
#   [:del_elt, deleted, ...]        copied through unchanged from Diff#ses
#   [:add_elt, ..., added]          copied through unchanged from Diff#ses
#   [:change_elt, deleted, added]   built here from a :del_elt block that is
#                                   immediately followed by an :add_elt block
class Difference < Array

  # The unprocessed blocks exactly as yielded by Diff#ses.
  # Stays nil when the object was built without any input (see #initialize).
  attr_accessor :raw_list

  # array1, array2:: the two sequences handed to Diff.new.
  #
  # When both are nil nothing is computed: the object stays an empty list
  # and @raw_list is never assigned.
  def initialize(array1 = nil, array2 = nil)
    return if array1.nil? && array2.nil?

    edit_script = Diff.new(array1, array2)
    @raw_list = []
    # Diff::EditScript does not have each_with_index(), so copy the blocks
    # into a plain Array first.
    edit_script.ses.each { |chunk| @raw_list << chunk }
    combine_del_add_to_change!
  end

  # Fills self from @raw_list, merging each "deletion directly followed by
  # addition" pair into a single :change_elt block.
  #
  # A :del_elt block is never emitted when it is visited (unless it is the
  # very last block); the block that follows it decides what becomes of it.
  #
  # Raises RuntimeError when a block carries an unknown tag.
  def combine_del_add_to_change!
    final_index = @raw_list.size - 1

    @raw_list.each_with_index do |chunk, idx|
      after_deletion = idx > 0 && @raw_list[idx - 1].first == :del_elt

      case chunk.first
      when :common_elt_elt
        # Flush the pending deletion (if any) ahead of the common block.
        self << @raw_list[idx - 1] if after_deletion
        self << chunk
      when :del_elt
        # Only a trailing deletion is emitted here.
        self << chunk if idx == final_index
      when :add_elt
        if after_deletion
          removed  = @raw_list[idx - 1][1]
          inserted = chunk[2]
          self << [:change_elt, removed, inserted]
        else
          self << chunk
        end
      else
        raise "the first element of the block #{idx} is invalid: (#{chunk.first})\n"
      end
    end
  end

  # Returns a copy holding only what concerns the former sequence:
  # :add_elt blocks are dropped and the added part (third slot) of every
  # :change_elt block is replaced by nil.  self is left untouched.
  def former_only
    one_side_only(:add_elt) { |tag, removed, _inserted| [tag, removed, nil] }
  end

  # Returns a copy holding only what concerns the latter sequence:
  # :del_elt blocks are dropped and the deleted part (second slot) of every
  # :change_elt block is replaced by nil.  self is left untouched.
  def latter_only
    one_side_only(:del_elt) { |tag, _removed, inserted| [tag, nil, inserted] }
  end

  private

  # Duplicates self, removes every block tagged +dropped_tag+ and rewrites
  # each :change_elt block through the given block (called with the tag,
  # the deleted part and the added part).
  def one_side_only(dropped_tag)
    kept = dup.delete_if { |entry| entry[0] == dropped_tag }
    kept.collect! do |entry|
      entry[0] == :change_elt ? yield(entry[0], entry[1], entry[2]) : entry
    end
    kept
  end

end # class Difference
@@ -0,0 +1,127 @@
1
+ # Document class, a part of DocDiff
2
+ # 2004-01-14.. Hisashi MORITA
3
+
4
+ require 'docdiff/charstring'
5
+
6
# Raised by Document#initialize when no encoding is given and
# CharString.guess_encoding reports "UNKNOWN".
# NOTE(review): this inherits from Exception, not StandardError, so a bare
# `rescue` will not catch it; callers must rescue it by name.
class EncodingDetectionFailure < Exception; end
8
# Raised by Document#initialize when no end-of-line style is given and
# CharString.guess_eol reports "UNKNOWN".
# NOTE(review): this inherits from Exception, not StandardError, so a bare
# `rescue` will not catch it; callers must rescue it by name.
class EOLDetectionFailure < Exception; end
10
+
11
# A text body together with its encoding and end-of-line settings.
#
# The string handed to the constructor is extended with CharString and kept
# as @body; every query method below simply forwards to that string.
class Document

  # str:: the text.  It is extended in place with CharString (the caller's
  #       object is modified, not copied).
  # enc:: encoding name; when nil and the body reports no encoding yet, it
  #       is guessed with CharString.guess_encoding.
  # e::   end-of-line name; when nil it is guessed with CharString.guess_eol.
  #
  # Raises EncodingDetectionFailure / EOLDetectionFailure when the
  # respective guess comes back as "UNKNOWN".
  def initialize(str, enc = nil, e = nil)
    @body = str
    @body.extend CharString

    if enc
      @body.encoding = enc
    elsif !@body.encoding
      @body.encoding = guess_encoding_or_fail(str)
    end

    @body.eol = e || guess_eol_or_fail(str)
  end

  # Sets the encoding of the underlying body.
  def encoding=(cs)
    @body.encoding = cs
  end

  # Sets the end-of-line style of the underlying body.
  def eol=(eolstr)
    @body.eol = eolstr
  end

  # Zero-argument methods forwarded verbatim to the body:
  # settings, splitters, then line / word / char / byte counters.
  %w[
    encoding eol eol_char
    split_to_line split_to_word split_to_char split_to_byte
    count_line count_blank_line count_empty_line count_graph_line
    count_word count_latin_word count_ja_word
    count_valid_word count_latin_valid_word count_ja_valid_word
    count_char count_blank_char count_graph_char
    count_latin_blank_char count_latin_graph_char
    count_ja_blank_char count_ja_graph_char
    count_byte
  ].each do |forwarded|
    define_method(forwarded) { @body.public_send(forwarded) }
  end

  private

  # Returns the guessed encoding of +str+, or raises when it is "UNKNOWN".
  # Deliberately no fallback to ASCII: a wrong default was judged worse
  # than failing.
  def guess_encoding_or_fail(str)
    candidate = CharString.guess_encoding(str)
    if candidate == "UNKNOWN"
      raise EncodingDetectionFailure, "encoding not specified, and auto detection failed."
    end
    candidate
  end

  # Returns the guessed end-of-line style of +str+, or raises when it is
  # "UNKNOWN" (no fallback to LF).
  def guess_eol_or_fail(str)
    candidate = CharString.guess_eol(str)
    if candidate == "UNKNOWN"
      raise EOLDetectionFailure, "eol not specified, and auto detection failed."
    end
    candidate
  end

end # class Document
@@ -0,0 +1,97 @@
1
+ # English ASCII encoding module for CharString
2
+ # 2003- Hisashi MORITA
3
+
4
module CharString
  # Character-class tables and word-splitting pattern for US-ASCII text.
  #
  # Each table is a String that simply lists every byte belonging to the
  # class; the tables are meant to be interpolated inside a regexp
  # character class ("[#{...}]"), as WORD_REGEXP_SRC below does.
  # The module registers itself with CharString.register_encoding when this
  # file is loaded.
  module ASCII

    # Name of the encoding this module describes.
    # NOTE(review): inside this module the constant shadows Ruby's built-in
    # ::Encoding class; presumably CharString.register_encoding reads it as
    # the lookup key -- confirm in charstring.rb.
    Encoding = "US-ASCII"

    # Control characters: 0x00-0x1f and 0x7f (DEL).
    CNTRL = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
            "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
            "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
            "\x1e\x1f\x7f"
    # Whitespace: HT, LF, VT, FF, CR and space.
    SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
    # Horizontal blanks only: HT and space.
    BLANK = "\x09\x20"
    # Decimal digits 0-9.
    DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
    # Letters A-Z followed by a-z.
    ALPHA = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
            "\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
            "\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
            "\x79\x7a"
    # Digits followed by the letters (DIGIT + ALPHA spelled out).
    ALNUM = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
            "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
            "\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
            "\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
            "\x79\x7a"
    # Punctuation: 0x21-0x2f, 0x3a-0x40, 0x5b-0x60 and 0x7b-0x7e.
    # (Rewritten in place with Regexp.quote further down.)
    PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
            "\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
            "\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
            "\x7d\x7e"
    # Lower-case letters a-z.
    LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
            "\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
            "\x75\x76\x77\x78\x79\x7a"
    # Upper-case letters A-Z.
    UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a"
    # Printable characters: 0x20 (space) through 0x7e.
    # (Rewritten in place with Regexp.quote further down.)
    PRINT = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29" \
            "\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33" \
            "\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d" \
            "\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47" \
            "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51" \
            "\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b" \
            "\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65" \
            "\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
            "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79" \
            "\x7a\x7b\x7c\x7d\x7e"
    # Visible characters: 0x21 through 0x7e (PRINT without the space).
    # (Rewritten in place with Regexp.quote further down.)
    GRAPH = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
            "\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34" \
            "\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e" \
            "\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48" \
            "\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52" \
            "\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c" \
            "\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66" \
            "\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70" \
            "\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a" \
            "\x7b\x7c\x7d\x7e"
    # Hexadecimal digits: 0-9, A-F, a-f.
    XDIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
             "\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
             "\x65\x66"

    # Empty placeholders: ASCII has no Japanese characters.
    # NOTE(review): presumably defined so that shared code which looks up
    # these constants on every encoding module still finds them -- confirm.
    JA_BLANK = "" # kludge...
    JA_GRAPH = "" # kludge...

    # Escape regexp metacharacters in the three tables that contain them,
    # so they can be dropped into a character class without the warning
    # "character class has `[' without escape".
    # NOTE(review): String#replace mutates the constants in place, so these
    # literals must stay unfrozen (a frozen_string_literal magic comment
    # would make these lines raise).
    PUNCT.replace(Regexp.quote(PUNCT)) # kludge to avoid warning "character class has `[' without escape"
    PRINT.replace(Regexp.quote(PRINT)) # kludge to avoid warning "character class has `[' without escape"
    GRAPH.replace(Regexp.quote(GRAPH)) # kludge to avoid warning "character class has `[' without escape"

    # Regexp source (a String, not a Regexp) describing one "word".
    # Alternatives, tried in this order:
    #   1. a run of GRAPH characters, optionally followed by one BLANK
    #   2. a run of SPACE characters
    #   3. fallback: the shortest possible run of anything else
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "|(?:[#{SPACE}]+)",
                       "|(?:.+?)"].join

    # override default method, as ASCII has no Japanese in it
    # Always returns 0.
    def count_ja_graph_char()
      0
    end

    # override default method, as ASCII has no Japanese in it
    # Always returns 0.
    def count_ja_blank_char()
      0
    end

    # override default method, as ASCII has no Japanese in it
    # Always returns 0.
    def count_ja_word()
      0
    end

    # override default method, as ASCII has no Japanese in it
    # Always returns 0.
    def count_ja_valid_word()
      0
    end

    # Make this encoding module known to CharString (runs at load time).
    CharString.register_encoding(self)

  end # module ASCII
end # module CharString
@@ -0,0 +1,269 @@
1
+ # Japanese EUC-JP encoding module for CharString
2
+ # 2003- Hisashi MORITA
3
+
4
module CharString
  # Character-class tables and word-splitting pattern for EUC-JP text.
  #
  # The single-byte tables are the same lists used for ASCII.  The Japanese
  # tables list two-byte characters back to back as raw bytes.  All tables
  # are meant to be interpolated inside a regexp character class
  # ("[#{...}]"), as WORD_REGEXP_SRC below does.
  # NOTE(review): the two-byte tables only work as character classes if the
  # regexp built from them is interpreted as EUC-JP -- confirm how
  # charstring.rb compiles them.
  # The module registers itself with CharString.register_encoding when this
  # file is loaded.
  module EUC_JP

    # Name of the encoding this module describes.
    # NOTE(review): inside this module the constant shadows Ruby's built-in
    # ::Encoding class; presumably CharString.register_encoding reads it as
    # the lookup key -- confirm in charstring.rb.
    Encoding = "EUC-JP"

    # character table based on:
    # ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT

    # Control characters: 0x00-0x1f and 0x7f (DEL).
    CNTRL = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09" \
            "\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13" \
            "\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d" \
            "\x1e\x1f\x7f"
    # Whitespace: HT, LF, VT, FF, CR and space.
    SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
    # Horizontal blanks only: HT and space.
    BLANK = "\x09\x20"
    # Decimal digits 0-9.
    DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
    # Letters A-Z followed by a-z.
    ALPHA = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
            "\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
            "\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
            "\x79\x7a"
    # Digits followed by the letters (DIGIT + ALPHA spelled out).
    ALNUM = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
            "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
            "\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
            "\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
            "\x79\x7a"
    # Punctuation: 0x21-0x2f, 0x3a-0x40, 0x5b-0x60 and 0x7b-0x7e.
    # (Rewritten in place with Regexp.quote further down.)
    PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
            "\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
            "\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
            "\x7d\x7e"
    # Lower-case letters a-z.
    LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
            "\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
            "\x75\x76\x77\x78\x79\x7a"
    # Upper-case letters A-Z.
    UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
            "\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
            "\x55\x56\x57\x58\x59\x5a"
    # Printable characters: 0x20 (space) through 0x7e.
    # (Rewritten in place with Regexp.quote further down.)
    PRINT = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29" \
            "\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33" \
            "\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d" \
            "\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47" \
            "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51" \
            "\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b" \
            "\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65" \
            "\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
            "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79" \
            "\x7a\x7b\x7c\x7d\x7e"
    # Visible characters: 0x21 through 0x7e (PRINT without the space).
    # (Rewritten in place with Regexp.quote further down.)
    GRAPH = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
            "\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34" \
            "\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e" \
            "\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48" \
            "\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52" \
            "\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c" \
            "\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66" \
            "\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70" \
            "\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a" \
            "\x7b\x7c\x7d\x7e"
    # Hexadecimal digits: 0-9, A-F, a-f.
    XDIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
             "\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
             "\x65\x66"
    # The single two-byte character 0xa1a1 (presumably the full-width /
    # ideographic space -- confirm against JIS0208.TXT).
    JA_SPACE = "\xa1\xa1"
    JA_BLANK = "\xa1\xa1"
    # Two-byte alphanumerics with lead byte 0xa3: 0xa3b0-0xa3b9,
    # 0xa3c1-0xa3da and 0xa3e1-0xa3fa (presumably full-width 0-9, A-Z, a-z).
    JA_ALNUM = "\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4" \
               "\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9" \
               "\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5" \
               "\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca" \
               "\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf" \
               "\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4" \
               "\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9" \
               "\xa3\xda\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4" \
               "\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9" \
               "\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee" \
               "\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3" \
               "\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8" \
               "\xa3\xf9\xa3\xfa"
    # Two-byte symbols with lead byte 0xa1, 0xa2, 0xa6, 0xa7 or 0xa8.
    # The 0xa1 run starts at 0xa1a2, i.e. right after JA_SPACE, and includes
    # the marks that HIRA_EX / KATA_EX / KANJI_EX add below.
    # NOTE(review): lead bytes 0xa6-0xa8 presumably cover the Greek,
    # Cyrillic and box-drawing rows of JIS X 0208, which are classed as
    # punctuation here -- confirm that this is intended.
    JA_PUNCT = "\xa1\xa2\xa1\xa3\xa1\xa4\xa1\xa5\xa1\xa6" \
               "\xa1\xa7\xa1\xa8\xa1\xa9\xa1\xaa\xa1\xab" \
               "\xa1\xac\xa1\xad\xa1\xae\xa1\xaf\xa1\xb0" \
               "\xa1\xb1\xa1\xb2\xa1\xb3\xa1\xb4\xa1\xb5" \
               "\xa1\xb6\xa1\xb7\xa1\xb8\xa1\xb9\xa1\xba" \
               "\xa1\xbb\xa1\xbc\xa1\xbd\xa1\xbe\xa1\xbf" \
               "\xa1\xc0\xa1\xc1\xa1\xc2\xa1\xc3\xa1\xc4" \
               "\xa1\xc5\xa1\xc6\xa1\xc7\xa1\xc8\xa1\xc9" \
               "\xa1\xca\xa1\xcb\xa1\xcc\xa1\xcd\xa1\xce" \
               "\xa1\xcf\xa1\xd0\xa1\xd1\xa1\xd2\xa1\xd3" \
               "\xa1\xd4\xa1\xd5\xa1\xd6\xa1\xd7\xa1\xd8" \
               "\xa1\xd9\xa1\xda\xa1\xdb\xa1\xdc\xa1\xdd" \
               "\xa1\xde\xa1\xdf\xa1\xe0\xa1\xe1\xa1\xe2" \
               "\xa1\xe3\xa1\xe4\xa1\xe5\xa1\xe6\xa1\xe7" \
               "\xa1\xe8\xa1\xe9\xa1\xea\xa1\xeb\xa1\xec" \
               "\xa1\xed\xa1\xee\xa1\xef\xa1\xf0\xa1\xf1" \
               "\xa1\xf2\xa1\xf3\xa1\xf4\xa1\xf5\xa1\xf6" \
               "\xa1\xf7\xa1\xf8\xa1\xf9\xa1\xfa\xa1\xfb" \
               "\xa1\xfc\xa1\xfd\xa1\xfe\xa2\xa1\xa2\xa2" \
               "\xa2\xa3\xa2\xa4\xa2\xa5\xa2\xa6\xa2\xa7" \
               "\xa2\xa8\xa2\xa9\xa2\xaa\xa2\xab\xa2\xac" \
               "\xa2\xad\xa2\xae\xa2\xba\xa2\xbb\xa2\xbc" \
               "\xa2\xbd\xa2\xbe\xa2\xbf\xa2\xc0\xa2\xc1" \
               "\xa2\xca\xa2\xcb\xa2\xcc\xa2\xcd\xa2\xce" \
               "\xa2\xcf\xa2\xd0\xa2\xdc\xa2\xdd\xa2\xde" \
               "\xa2\xdf\xa2\xe0\xa2\xe1\xa2\xe2\xa2\xe3" \
               "\xa2\xe4\xa2\xe5\xa2\xe6\xa2\xe7\xa2\xe8" \
               "\xa2\xe9\xa2\xea\xa2\xf2\xa2\xf3\xa2\xf4" \
               "\xa2\xf5\xa2\xf6\xa2\xf7\xa2\xf8\xa2\xf9" \
               "\xa2\xfe\xa6\xa1\xa6\xa2\xa6\xa3\xa6\xa4" \
               "\xa6\xa5\xa6\xa6\xa6\xa7\xa6\xa8\xa6\xa9" \
               "\xa6\xaa\xa6\xab\xa6\xac\xa6\xad\xa6\xae" \
               "\xa6\xaf\xa6\xb0\xa6\xb1\xa6\xb2\xa6\xb3" \
               "\xa6\xb4\xa6\xb5\xa6\xb6\xa6\xb7\xa6\xb8" \
               "\xa6\xc1\xa6\xc2\xa6\xc3\xa6\xc4\xa6\xc5" \
               "\xa6\xc6\xa6\xc7\xa6\xc8\xa6\xc9\xa6\xca" \
               "\xa6\xcb\xa6\xcc\xa6\xcd\xa6\xce\xa6\xcf" \
               "\xa6\xd0\xa6\xd1\xa6\xd2\xa6\xd3\xa6\xd4" \
               "\xa6\xd5\xa6\xd6\xa6\xd7\xa6\xd8\xa7\xa1" \
               "\xa7\xa2\xa7\xa3\xa7\xa4\xa7\xa5\xa7\xa6" \
               "\xa7\xa7\xa7\xa8\xa7\xa9\xa7\xaa\xa7\xab" \
               "\xa7\xac\xa7\xad\xa7\xae\xa7\xaf\xa7\xb0" \
               "\xa7\xb1\xa7\xb2\xa7\xb3\xa7\xb4\xa7\xb5" \
               "\xa7\xb6\xa7\xb7\xa7\xb8\xa7\xb9\xa7\xba" \
               "\xa7\xbb\xa7\xbc\xa7\xbd\xa7\xbe\xa7\xbf" \
               "\xa7\xc0\xa7\xc1\xa7\xd1\xa7\xd2\xa7\xd3" \
               "\xa7\xd4\xa7\xd5\xa7\xd6\xa7\xd7\xa7\xd8" \
               "\xa7\xd9\xa7\xda\xa7\xdb\xa7\xdc\xa7\xdd" \
               "\xa7\xde\xa7\xdf\xa7\xe0\xa7\xe1\xa7\xe2" \
               "\xa7\xe3\xa7\xe4\xa7\xe5\xa7\xe6\xa7\xe7" \
               "\xa7\xe8\xa7\xe9\xa7\xea\xa7\xeb\xa7\xec" \
               "\xa7\xed\xa7\xee\xa7\xef\xa7\xf0\xa7\xf1" \
               "\xa8\xa1\xa8\xa2\xa8\xa3\xa8\xa4\xa8\xa5" \
               "\xa8\xa6\xa8\xa7\xa8\xa8\xa8\xa9\xa8\xaa" \
               "\xa8\xab\xa8\xac\xa8\xad\xa8\xae\xa8\xaf" \
               "\xa8\xb0\xa8\xb1\xa8\xb2\xa8\xb3\xa8\xb4" \
               "\xa8\xb5\xa8\xb6\xa8\xb7\xa8\xb8\xa8\xb9" \
               "\xa8\xba\xa8\xbb\xa8\xbc\xa8\xbd\xa8\xbe" \
               "\xa8\xbf\xa8\xc0"
    # Hiragana, every character listed individually.
    HIRA = "\xa4\xa1\xa4\xa2\xa4\xa3\xa4\xa4\xa4\xa5" \
           "\xa4\xa6\xa4\xa7\xa4\xa8\xa4\xa9\xa4\xaa" \
           "\xa4\xab\xa4\xac\xa4\xad\xa4\xae\xa4\xaf" \
           "\xa4\xb0\xa4\xb1\xa4\xb2\xa4\xb3\xa4\xb4" \
           "\xa4\xb5\xa4\xb6\xa4\xb7\xa4\xb8\xa4\xb9" \
           "\xa4\xba\xa4\xbb\xa4\xbc\xa4\xbd\xa4\xbe" \
           "\xa4\xbf\xa4\xc0\xa4\xc1\xa4\xc2\xa4\xc3" \
           "\xa4\xc4\xa4\xc5\xa4\xc6\xa4\xc7\xa4\xc8" \
           "\xa4\xc9\xa4\xca\xa4\xcb\xa4\xcc\xa4\xcd" \
           "\xa4\xce\xa4\xcf\xa4\xd0\xa4\xd1\xa4\xd2" \
           "\xa4\xd3\xa4\xd4\xa4\xd5\xa4\xd6\xa4\xd7" \
           "\xa4\xd8\xa4\xd9\xa4\xda\xa4\xdb\xa4\xdc" \
           "\xa4\xdd\xa4\xde\xa4\xdf\xa4\xe0\xa4\xe1" \
           "\xa4\xe2\xa4\xe3\xa4\xe4\xa4\xe5\xa4\xe6" \
           "\xa4\xe7\xa4\xe8\xa4\xe9\xa4\xea\xa4\xeb" \
           "\xa4\xec\xa4\xed\xa4\xee\xa4\xef\xa4\xf0" \
           "\xa4\xf1\xa4\xf2\xa4\xf3" # [\xa4\xa1-\xa4\xf3]
    # Hiragana plus the long-vowel mark (0xa1bc) and the two hiragana
    # iteration marks (0xa1b3, 0xa1b4).
    HIRA_EX = HIRA +
              "\xa1\xbc" +
              "\xa1\xb3\xa1\xb4" # add onbiki and kanagaeshi(hira)
    # Katakana, every character listed individually.
    KATA = "\xa5\xa1\xa5\xa2\xa5\xa3\xa5\xa4\xa5\xa5" \
           "\xa5\xa6\xa5\xa7\xa5\xa8\xa5\xa9\xa5\xaa" \
           "\xa5\xab\xa5\xac\xa5\xad\xa5\xae\xa5\xaf" \
           "\xa5\xb0\xa5\xb1\xa5\xb2\xa5\xb3\xa5\xb4" \
           "\xa5\xb5\xa5\xb6\xa5\xb7\xa5\xb8\xa5\xb9" \
           "\xa5\xba\xa5\xbb\xa5\xbc\xa5\xbd\xa5\xbe" \
           "\xa5\xbf\xa5\xc0\xa5\xc1\xa5\xc2\xa5\xc3" \
           "\xa5\xc4\xa5\xc5\xa5\xc6\xa5\xc7\xa5\xc8" \
           "\xa5\xc9\xa5\xca\xa5\xcb\xa5\xcc\xa5\xcd" \
           "\xa5\xce\xa5\xcf\xa5\xd0\xa5\xd1\xa5\xd2" \
           "\xa5\xd3\xa5\xd4\xa5\xd5\xa5\xd6\xa5\xd7" \
           "\xa5\xd8\xa5\xd9\xa5\xda\xa5\xdb\xa5\xdc" \
           "\xa5\xdd\xa5\xde\xa5\xdf\xa5\xe0\xa5\xe1" \
           "\xa5\xe2\xa5\xe3\xa5\xe4\xa5\xe5\xa5\xe6" \
           "\xa5\xe7\xa5\xe8\xa5\xe9\xa5\xea\xa5\xeb" \
           "\xa5\xec\xa5\xed\xa5\xee\xa5\xef\xa5\xf0" \
           "\xa5\xf1\xa5\xf2\xa5\xf3\xa5\xf4\xa5\xf5" \
           "\xa5\xf6" # [\xa5\xa1-\xa5\xf6]
    # Katakana plus the long-vowel mark (0xa1bc) and the two katakana
    # iteration marks (0xa1b5, 0xa1b6).
    KATA_EX = KATA +
              "\xa1\xbc" +
              "\xa1\xb5\xa1\xb6" # add onbiki and kanagaeshi(kata)
    # Kanji.  Unlike the tables above this is NOT a plain list: each line is
    # a "first-last" range written in character-class syntax, one range per
    # lead byte 0xb0-0xf4.  Two ranges stop early: 0xcf ends at 0xcfd3 and
    # 0xf4 ends at 0xf4a6.  The string is only meaningful inside "[...]".
    KANJI = "\xb0\xa1-\xb0\xfe" \
            "\xb1\xa1-\xb1\xfe" \
            "\xb2\xa1-\xb2\xfe" \
            "\xb3\xa1-\xb3\xfe" \
            "\xb4\xa1-\xb4\xfe" \
            "\xb5\xa1-\xb5\xfe" \
            "\xb6\xa1-\xb6\xfe" \
            "\xb7\xa1-\xb7\xfe" \
            "\xb8\xa1-\xb8\xfe" \
            "\xb9\xa1-\xb9\xfe" \
            "\xba\xa1-\xba\xfe" \
            "\xbb\xa1-\xbb\xfe" \
            "\xbc\xa1-\xbc\xfe" \
            "\xbd\xa1-\xbd\xfe" \
            "\xbe\xa1-\xbe\xfe" \
            "\xbf\xa1-\xbf\xfe" \
            "\xc0\xa1-\xc0\xfe" \
            "\xc1\xa1-\xc1\xfe" \
            "\xc2\xa1-\xc2\xfe" \
            "\xc3\xa1-\xc3\xfe" \
            "\xc4\xa1-\xc4\xfe" \
            "\xc5\xa1-\xc5\xfe" \
            "\xc6\xa1-\xc6\xfe" \
            "\xc7\xa1-\xc7\xfe" \
            "\xc8\xa1-\xc8\xfe" \
            "\xc9\xa1-\xc9\xfe" \
            "\xca\xa1-\xca\xfe" \
            "\xcb\xa1-\xcb\xfe" \
            "\xcc\xa1-\xcc\xfe" \
            "\xcd\xa1-\xcd\xfe" \
            "\xce\xa1-\xce\xfe" \
            "\xcf\xa1-\xcf\xd3" \
            "\xd0\xa1-\xd0\xfe" \
            "\xd1\xa1-\xd1\xfe" \
            "\xd2\xa1-\xd2\xfe" \
            "\xd3\xa1-\xd3\xfe" \
            "\xd4\xa1-\xd4\xfe" \
            "\xd5\xa1-\xd5\xfe" \
            "\xd6\xa1-\xd6\xfe" \
            "\xd7\xa1-\xd7\xfe" \
            "\xd8\xa1-\xd8\xfe" \
            "\xd9\xa1-\xd9\xfe" \
            "\xda\xa1-\xda\xfe" \
            "\xdb\xa1-\xdb\xfe" \
            "\xdc\xa1-\xdc\xfe" \
            "\xdd\xa1-\xdd\xfe" \
            "\xde\xa1-\xde\xfe" \
            "\xdf\xa1-\xdf\xfe" \
            "\xe0\xa1-\xe0\xfe" \
            "\xe1\xa1-\xe1\xfe" \
            "\xe2\xa1-\xe2\xfe" \
            "\xe3\xa1-\xe3\xfe" \
            "\xe4\xa1-\xe4\xfe" \
            "\xe5\xa1-\xe5\xfe" \
            "\xe6\xa1-\xe6\xfe" \
            "\xe7\xa1-\xe7\xfe" \
            "\xe8\xa1-\xe8\xfe" \
            "\xe9\xa1-\xe9\xfe" \
            "\xea\xa1-\xea\xfe" \
            "\xeb\xa1-\xeb\xfe" \
            "\xec\xa1-\xec\xfe" \
            "\xed\xa1-\xed\xfe" \
            "\xee\xa1-\xee\xfe" \
            "\xef\xa1-\xef\xfe" \
            "\xf0\xa1-\xf0\xfe" \
            "\xf1\xa1-\xf1\xfe" \
            "\xf2\xa1-\xf2\xfe" \
            "\xf3\xa1-\xf3\xfe" \
            "\xf4\xa1-\xf4\xa6"
    # Kanji ranges plus the kanji repetition mark (0xa1b9).
    KANJI_EX = KANJI + "\xa1\xb9" # + noma
    # All visible Japanese characters.  Built from the plain HIRA / KATA /
    # KANJI tables; the extra marks of the *_EX variants are already part
    # of JA_PUNCT.  Contains KANJI's range syntax, so it is only meaningful
    # inside a character class.
    JA_GRAPH = JA_ALNUM + JA_PUNCT + HIRA + KATA + KANJI
    # JA_GRAPH plus the Japanese blank.
    JA_PRINT = JA_GRAPH + JA_BLANK

    # Escape regexp metacharacters in the three single-byte tables that
    # contain them, so they can be dropped into a character class without
    # the warning "character class has `[' without escape".
    # NOTE(review): String#replace mutates the constants in place, so these
    # literals must stay unfrozen (a frozen_string_literal magic comment
    # would make these lines raise).
    PUNCT.replace(Regexp.quote(PUNCT)) # kludge to avoid warning "character class has `[' without escape"
    PRINT.replace(Regexp.quote(PRINT)) # kludge to avoid warning "character class has `[' without escape"
    GRAPH.replace(Regexp.quote(GRAPH)) # kludge to avoid warning "character class has `[' without escape"

    # Regexp source (a String, not a Regexp) describing one "word".
    # Alternatives, tried in this order:
    #   1. a run of single-byte GRAPH characters, optionally followed by
    #      one BLANK
    #   2. a run of single-byte SPACE characters
    #   3. a run of KANJI_EX followed by a run of HIRA
    #   4. a run of KATA_EX followed by a run of HIRA
    #   5. a run of KANJI_EX
    #   6. a run of KATA_EX
    #   7. a run of HIRA_EX
    #   8. fallback: the shortest possible run of anything else
    WORD_REGEXP_SRC = ["(?:[#{GRAPH}]+[#{BLANK}]?)",
                       "|(?:[#{SPACE}]+)",
                       "|(?:[#{KANJI_EX}]+[#{HIRA}]+)",
                       "|(?:[#{KATA_EX}]+[#{HIRA}]+)",
                       "|(?:[#{KANJI_EX}]+)",
                       "|(?:[#{KATA_EX}]+)",
                       "|(?:[#{HIRA_EX}]+)",
                       "|(?:.+?)"].join

    # Make this encoding module known to CharString (runs at load time).
    CharString.register_encoding(self)

  end # module EUC_JP
end # module CharString