docdiff 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. data/.gitignore +6 -0
  2. data/.travis.yml +7 -0
  3. data/Gemfile +17 -0
  4. data/Guardfile +8 -0
  5. data/Makefile +108 -0
  6. data/Rakefile +17 -0
  7. data/bin/docdiff +179 -0
  8. data/devutil/JIS0208.TXT +6952 -0
  9. data/devutil/char_by_charclass.rb +23 -0
  10. data/devutil/charclass_by_char.rb +21 -0
  11. data/devutil/jis0208.rb +343 -0
  12. data/devutil/testjis0208.rb +38 -0
  13. data/docdiff.conf.example +22 -0
  14. data/docdiff.gemspec +23 -0
  15. data/docdiffwebui.cgi +176 -0
  16. data/docdiffwebui.html +123 -0
  17. data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
  18. data/img/docdiff-screenshot-format-html-firefox.png +0 -0
  19. data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
  20. data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
  21. data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
  22. data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
  23. data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
  24. data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
  25. data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
  26. data/index.html +181 -0
  27. data/langfilter.rb +14 -0
  28. data/lib/doc_diff.rb +170 -0
  29. data/lib/docdiff.rb +7 -0
  30. data/lib/docdiff/charstring.rb +579 -0
  31. data/lib/docdiff/diff.rb +217 -0
  32. data/lib/docdiff/diff/contours.rb +382 -0
  33. data/lib/docdiff/diff/editscript.rb +148 -0
  34. data/lib/docdiff/diff/rcsdiff.rb +107 -0
  35. data/lib/docdiff/diff/shortestpath.rb +93 -0
  36. data/lib/docdiff/diff/speculative.rb +40 -0
  37. data/lib/docdiff/diff/subsequence.rb +39 -0
  38. data/lib/docdiff/diff/unidiff.rb +124 -0
  39. data/lib/docdiff/difference.rb +92 -0
  40. data/lib/docdiff/document.rb +127 -0
  41. data/lib/docdiff/encoding/en_ascii.rb +97 -0
  42. data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
  43. data/lib/docdiff/encoding/ja_sjis.rb +260 -0
  44. data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
  45. data/lib/docdiff/version.rb +3 -0
  46. data/lib/docdiff/view.rb +476 -0
  47. data/lib/viewdiff.rb +375 -0
  48. data/readme.html +713 -0
  49. data/sample/01.en.ascii.cr +1 -0
  50. data/sample/01.en.ascii.crlf +2 -0
  51. data/sample/01.en.ascii.lf +2 -0
  52. data/sample/01.ja.eucjp.lf +2 -0
  53. data/sample/01.ja.sjis.cr +1 -0
  54. data/sample/01.ja.sjis.crlf +2 -0
  55. data/sample/01.ja.utf8.crlf +2 -0
  56. data/sample/02.en.ascii.cr +1 -0
  57. data/sample/02.en.ascii.crlf +2 -0
  58. data/sample/02.en.ascii.lf +2 -0
  59. data/sample/02.ja.eucjp.lf +2 -0
  60. data/sample/02.ja.sjis.cr +1 -0
  61. data/sample/02.ja.sjis.crlf +2 -0
  62. data/sample/02.ja.utf8.crlf +2 -0
  63. data/sample/humpty_dumpty01.ascii.lf +4 -0
  64. data/sample/humpty_dumpty02.ascii.lf +4 -0
  65. data/test/charstring_test.rb +1008 -0
  66. data/test/diff_test.rb +36 -0
  67. data/test/difference_test.rb +64 -0
  68. data/test/docdiff_test.rb +193 -0
  69. data/test/document_test.rb +626 -0
  70. data/test/test_helper.rb +7 -0
  71. data/test/view_test.rb +570 -0
  72. data/test/viewdiff_test.rb +908 -0
  73. metadata +129 -0
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/ruby
2
+ # language filter
3
+ # usage: langfilter.rb --en <infile >outfile
4
+
5
# Reports whether this Ruby attaches an encoding to every String
# (detected by checking that String responds to #encoding).
#
# Returns true or false (never nil), so the result can be compared or
# printed as well as used in a condition.
def ruby_m17n?
  "".respond_to?(:encoding)
end
8
+
9
# The first command-line argument (e.g. "--en" or "--ja") names the language
# to keep; elements whose lang/title attribute names the other language are
# removed from the input and the rest is written to standard output.
lang_to_include = ARGV.shift.to_s.gsub(/-+/, "")
lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
# Without a known language the pattern below would be built around an empty
# attribute value, so stop with a usage hint instead of filtering blindly.
abort "usage: langfilter.rb --en|--ja <infile >outfile" unless lang_to_exclude
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m

ARGF.set_encoding("UTF-8") if ruby_m17n?
ARGF.read.gsub(re, "").display
@@ -0,0 +1,170 @@
1
+ # DocDiff: word/character-oriented text comparison utility
2
+ # Copyright (C) 2002-2011 Hisashi MORITA
3
+ # Requirements: Ruby (>= 1.8)
4
+ class DocDiff
5
+
6
+ AppVersion = Docdiff::VERSION
7
+ Author = "Copyright (C) 2002-2011 Hisashi MORITA.\n" +
8
+ "diff library originates from Ruby/CVS by TANAKA Akira.\n"
9
+ License = "This software is licensed under so-called modified BSD license.\n" +
10
+ "See the document for detail.\n"
11
+ SystemConfigFileName = File.join(File::Separator, "etc", "docdiff", "docdiff.conf")
12
+ UserConfigFileName = File.join(ENV['HOME'], "etc", "docdiff", "docdiff.conf")
13
+ AltUserConfigFileName = File.join(ENV['HOME'], ".docdiff", "docdiff.conf")
14
+
15
+ def initialize()
16
+ @config = {}
17
+ end
18
+ attr_accessor :config
19
+
20
# Parses the text of a configuration file into a Hash.
#
# Each non-empty line must look like "name = value" (whitespace is required
# on both sides of the "="). Everything from a "#" to the end of the line is
# discarded before parsing, so values cannot contain "#".
#
# content - String holding the whole file; CR, LF and CRLF line ends are
#           all accepted.
#
# Returns a Hash mapping Symbol names to values. "on"/"yes"/"true" become
# true, "off"/"no"/"false" become false (both case-insensitively), strings
# of digits become Integers, and anything else stays a String.
# Raises RuntimeError when a line has no " = " separator or when the name
# contains whitespace.
def self.parse_config_file_content(content)
  result = {}
  return result if content.size <= 0
  content.split(/\r\n|\r|\n/).each do |raw_line|
    line = raw_line.sub(/#.*$/, '').strip
    next if line == ""
    raise 'line does not include " = ".' unless /[\s]+=[\s]+/.match line
    # Split on the first separator only, so that a value may itself
    # contain " = " (for example an HTML tag with spaced attributes).
    name_src, value_src = line.split(/[\s]+=[\s]+/, 2)
    raise "Invalid name: #{name_src.inspect}" if (/\s/.match name_src)
    raise "Invalid value: #{value_src.inspect}" unless value_src.kind_of?(String)
    value = value_src
    value = true if ['on','yes','true'].include? value_src.downcase
    value = false if ['off','no','false'].include? value_src.downcase
    value = value_src.to_i if /^[0-9]+$/.match value_src
    result[name_src.intern] = value
  end
  result
end
41
+
42
+ def compare_by_line(doc1, doc2)
43
+ Difference.new(doc1.split_to_line, doc2.split_to_line)
44
+ end
45
+
46
# Compares two documents line by line, then re-compares every changed line
# word by word.
#
# Entries of the line-level result whose first element is :change_elt are
# replaced by the entries of a word-level comparison of their before/after
# text; all other entries are carried over unchanged.
#
# Returns a Difference holding the combined entries in order.
def compare_by_line_word(doc1, doc2)
  line_diff = compare_by_line(doc1, doc2)
  word_diff = Difference.new
  line_diff.each do |entry|
    unless entry.first == :change_elt
      # :common_elt_elt, :del_elt, or :add_elt
      word_diff << entry
      next
    end
    old_text = Document.new(entry[1].join, doc1.encoding, doc1.eol)
    new_text = Document.new(entry[2].join, doc2.encoding, doc2.eol)
    Difference.new(old_text.split_to_word, new_text.split_to_word).each do |word|
      word_diff << word
    end
  end
  word_diff
end
65
+
66
# Compares two documents line by line, then word by word within changed
# lines, then character by character within changed words.
#
# The line and word passes are delegated to compare_by_line_word; this
# method only adds the final pass, replacing each remaining :change_elt
# entry with the entries of a character-level comparison of its
# before/after text.
#
# Returns a Difference holding the combined entries in order.
def compare_by_line_word_char(doc1, doc2)
  lines_and_words = compare_by_line_word(doc1, doc2)
  lines_words_and_chars = Difference.new
  lines_and_words.each do |line_or_word|
    if line_or_word.first == :change_elt
      before_change = Document.new(line_or_word[1].join, doc1.encoding, doc1.eol)
      after_change = Document.new(line_or_word[2].join, doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_char, after_change.split_to_char).each do |char|
        lines_words_and_chars << char
      end
    else # :common_elt_elt, :del_elt, or :add_elt
      lines_words_and_chars << line_or_word
    end
  end
  lines_words_and_chars
end
98
+
99
# Compares two documents and renders the difference as a single String.
#
# doc1, doc2 - the documents to compare; both must respond to #encoding
#              and #eol.
# option     - settings indexed by Symbol. option[:resolution] is "line",
#              "word" or "char"; option[:format] is "tty", "html",
#              "manued", "wdiff", "stat" or "user"; a truthy
#              option[:digest] selects the digest variant of the format
#              ("stat" has no digest variant). A missing :digest is
#              treated as false.
#
# For the "user" format the tag strings are read from @config; keys that
# are not set are initialised to '' as a side effect.
#
# Returns the joined output of the selected View method.
# Raises RuntimeError when option, option[:resolution] or option[:format]
# is nil, when the sanity check on the documents fails, or when the
# resolution or format is not supported.
def run(doc1, doc2, option)
  raise "option is nil" if option.nil?
  raise "option[:resolution] is nil" if option[:resolution].nil?
  raise "option[:format] is nil" if option[:format].nil?
  # Sanity check. The first branch that matches wins, so this raises only
  # when none of the three conditions holds.
  case
  when doc1.class == Document && doc2.class == Document # OK
  when doc1.encoding != nil && doc2.encoding != nil # OK
  when doc1.encoding == doc2.encoding && doc1.eol == doc2.eol # OK
  else
    raise("Error! Blame the author (doc1: #{doc1.encoding}, #{doc1.eol}, doc2: #{doc2.encoding}, #{doc2.eol}).")
  end

  difference =
    case option[:resolution]
    when "line" then compare_by_line(doc1, doc2)
    when "word" then compare_by_line_word(doc1, doc2)
    when "char" then compare_by_line_word_char(doc1, doc2)
    else
      raise "Unsupported resolution: #{option[:resolution].inspect}"
    end
  view = View.new(difference, doc1.encoding, doc1.eol)
  user_tags = {:start_common        => (@config[:tag_common_start] ||= ''),
               :end_common          => (@config[:tag_common_end] ||= ''),
               :start_del           => (@config[:tag_del_start] ||= ''),
               :end_del             => (@config[:tag_del_end] ||= ''),
               :start_add           => (@config[:tag_add_start] ||= ''),
               :end_add             => (@config[:tag_add_end] ||= ''),
               :start_before_change => (@config[:tag_change_before_start] ||= ''),
               :end_before_change   => (@config[:tag_change_before_end] ||= ''),
               :start_after_change  => (@config[:tag_change_after_start] ||= ''),
               :end_after_change    => (@config[:tag_change_after_end] ||= '')}
  # Decide on truthiness rather than on the literals true/false, so that an
  # absent :digest option no longer leaves the result unset.
  digest = option[:digest] ? true : false
  result =
    case option[:format]
    when "tty"    then digest ? view.to_tty_digest(option) : view.to_tty(option)
    when "html"   then digest ? view.to_html_digest(option) : view.to_html(option)
    when "manued" then digest ? view.to_manued_digest(option) : view.to_manued(option)
    when "wdiff"  then digest ? view.to_wdiff_digest(option) : view.to_wdiff(option)
    when "stat"   then view.to_stat(option)
    when "user"   then digest ? view.to_user_digest(user_tags) : view.to_user(user_tags)
    else
      raise "Unsupported output format: #{option[:format].inspect}."
    end
  result.join
end
155
+
156
# Reads a configuration file and merges its settings into #config.
#
# filename - path of the configuration file.
#
# Returns a message String when the file does not exist, nil otherwise.
# Errors other than a missing file (including parse errors) propagate to
# the caller.
def process_config_file(filename)
  content = File.open(filename, "r") { |f| f.read }
rescue Errno::ENOENT
  "config file not found so not read."
else
  config.update(DocDiff.parse_config_file_content(content)) unless content.nil?
  nil
end
169
+
170
+ end # class DocDiff
@@ -0,0 +1,7 @@
1
+ # DocDiff: word/character-oriented text comparison utility
2
+ # Copyright (C) 2002-2011 Hisashi MORITA
3
+ # Requirements: Ruby (>= 1.8)
4
+ require 'docdiff/version'
5
+ require 'doc_diff'
6
+ module Docdiff
7
+ end
@@ -0,0 +1,579 @@
1
+ #!/usr/bin/ruby
2
+ # Character String module.
3
+ # To use, include to String, or extend String.
4
+ # 2003- Hisashi MORITA
5
+
6
+ module CharString
7
+
8
+ Encodings = {}
9
+ EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
10
+
11
+ def initialize(string)
12
+ =begin unnecessary
13
+ # @encoding = CharString.guess_encoding(string)
14
+ # @eol = CharString.guess_eol(string)
15
+ =end unnecessary
16
+ super
17
+ end
18
+
19
+ def eol()
20
+ @eol
21
+ # if @eol
22
+ # @eol
23
+ # else
24
+ # @eol = CharString.guess_eol(self)
25
+ # # raise "eol is not set.\n"
26
+ # end
27
+ end
28
+
29
+ def eol=(e)
30
+ @eol = e
31
+ extend EOLChars[@eol]
32
+ end
33
+
34
# Default end-of-line character lookup.
#
# Returns @eol_char when it is set to a truthy value, nil otherwise.
# The EOL modules registered in EOLChars define their own eol_char, which
# takes precedence once #eol= has extended the object with one of them.
def eol_char()
  @eol_char || nil
end
43
+
44
# Builds a one-line description of this string for debugging.
#
# Returns a String with the object id, class, content, and the encoding
# and EOL modules registered for @encoding and @eol.
# Raises RuntimeError when @encoding or @eol is nil, is not registered in
# Encodings / EOLChars, or when the registered encoding is not a Module.
def debug()
  raise "@encoding is nil." if @encoding == nil
  raise "Encodings[@encoding(=#{@encoding})] is nil." if Encodings[@encoding] == nil
  if Encodings[@encoding].class != Module
    raise "Encodings[@encoding].class(=#{Encodings[@encoding].class}) is not a module."
  end
  raise "@eol is nil." if @eol == nil
  raise "EOLChars[@eol(=#{@eol})] is nil." if EOLChars[@eol] == nil
  # Object#id no longer exists on Ruby 1.9 and later; object_id is the
  # equivalent that is available on every supported Ruby.
  ["id: #{self.object_id}, class: #{self.class}, self: #{self}, ",
   "module: #{Encodings[@encoding]}, #{EOLChars[@eol]}"].join
end
62
+
63
+ def CharString.register_encoding(mod)
64
+ Encodings[mod::Encoding] = mod
65
+ end
66
+
67
+ def CharString.register_eol(mod)
68
+ EOLChars[mod::EOL] = mod
69
+ end
70
+
71
# Guesses which end-of-line convention a string uses.
#
# string - the text to inspect, or nil.
#
# Returns nil for nil input, 'CR', 'LF' or 'CRLF' when exactly one kind of
# line ending occurs, 'NONE' when none occurs (possibly a one-line text),
# and 'UNKNOWN' when several kinds are mixed (possibly binary data).
def self.guess_eol(string)
  return nil if string == nil
  found = []
  found << 'CR'   if string.scan(/(\r)(?!\n)/o).size > 0          # CR not followed by LF
  found << 'LF'   if string.scan(/(?:\A|[^\r])(\n)/o).size > 0    # LF not preceded by CR
  found << 'CRLF' if string.scan(/(\r\n)/o).size > 0
  case found.size
  when 1 then found[0]
  when 0 then 'NONE'
  else 'UNKNOWN'
  end
end
89
+
90
+ def CharString.ruby_m17n?
91
+ "".respond_to?(:force_encoding)
92
+ end
93
+
94
+ # Note that some languages (like Japanese) do not have 'word' or 'phrase',
95
+ # thus some of the following methods are not 'linguistically correct'.
96
+
97
+ def count_byte()
98
+ split_to_byte().size
99
+ end
100
+
101
+ def count_char() # eol = 1 char
102
+ split_to_char().size
103
+ end
104
+
105
+ def count_graph_char()
106
+ count_latin_graph_char() + count_ja_graph_char()
107
+ end
108
+
109
+ def count_blank_char()
110
+ count_latin_blank_char() + count_ja_blank_char()
111
+ end
112
+
113
+ def count_word()
114
+ split_to_word().size
115
+ end
116
+
117
+ def count_valid_word()
118
+ count_latin_valid_word() + count_ja_valid_word()
119
+ end
120
+
121
+ def count_line() # this is common to all encodings.
122
+ split_to_line.size
123
+ end
124
+
125
# Counts the lines that hold nothing but the end-of-line character.
#
# A line is counted when the pattern below matches it: the EOL sequence at
# the start of a line, or an empty line. When eol_char is nil the EOL
# alternative is left out; interpolating nil would give an empty group
# that matches every line.
#
# Returns the number of matching lines from split_to_line.
def count_empty_line()
  pattern = eol_char ? /^(?:#{eol_char})|^$/m : /^$/m
  split_to_line.select { |line| pattern.match line }.size
end
130
+
131
+ if ruby_m17n?
132
+ # for Ruby-1.9
133
+ def encoding()
134
+ String.new(self).encoding.to_s
135
+ end
136
+
137
+ def encoding=(cs)
138
+ force_encoding(cs) if self
139
+ end
140
+
141
# Reports the encoding Ruby has attached to a string.
#
# string - the String to inspect, or nil.
#
# Returns the encoding name as a String, or nil when no string is given.
def self.guess_encoding(string)
  string ? string.encoding.to_s : nil
end
148
+
149
# Splits the string into its individual bytes.
#
# A copy of the string is relabelled as ASCII-8BIT rather than transcoded:
# transcoding to ASCII-8BIT raises Encoding::UndefinedConversionError for
# any non-ASCII character, while relabelling keeps the bytes as they are.
#
# Returns an Array of one-byte Strings.
def split_to_byte()
  String.new(self).force_encoding("ASCII-8BIT").scan(/./nm)
end
152
+
153
# Splits the string into characters, treating the end-of-line sequence as
# a single character when eol_char is set.
#
# Matching is done on a UTF-8 copy and every piece is converted back to
# the string's own encoding.
#
# Returns an Array of Strings.
def split_to_char()
  separator = eol_char # nil when no EOL module has been extended
  source = separator ? "(?:#{separator})|(?:.)" : "(?:.)"
  pieces = encode('UTF-8').scan(Regexp.new(source, Regexp::MULTILINE))
  pieces.map { |piece| piece.encode(self.encoding) }
end
164
+
165
+ def count_latin_graph_char()
166
+ encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::GRAPH}]",
167
+ Regexp::MULTILINE)
168
+ ).size
169
+ end
170
+
171
+ def count_ja_graph_char()
172
+ encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
173
+ Regexp::MULTILINE)
174
+ ).size
175
+ end
176
+
177
+ def count_latin_blank_char()
178
+ encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::BLANK}]",
179
+ Regexp::MULTILINE)
180
+ ).size
181
+ end
182
+
183
+ def count_ja_blank_char()
184
+ encode('UTF-8').scan(Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]",
185
+ Regexp::MULTILINE)
186
+ ).size
187
+ end
188
+
189
+ def split_to_word()
190
+ encode('UTF-8').scan(Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC,
191
+ Regexp::MULTILINE)
192
+ ).map{|e| e.encode(self.encoding)}
193
+ end
194
+
195
+ def count_latin_word()
196
+ split_to_word.collect{|word|
197
+ word if Regexp.new("[#{Encodings['UTF-8']::PRINT}]",
198
+ Regexp::MULTILINE).match word.encode('UTF-8')
199
+ }.compact.size
200
+ end
201
+
202
+ def count_ja_word()
203
+ split_to_word.collect{|word|
204
+ word if Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]",
205
+ Regexp::MULTILINE).match word.encode('UTF-8')
206
+ }.compact.size
207
+ end
208
+
209
+ def count_latin_valid_word()
210
+ split_to_word.collect{|word|
211
+ word if Regexp.new("[#{Encodings['UTF-8']::ALNUM}]",
212
+ Regexp::MULTILINE).match word.encode('UTF-8')
213
+ }.compact.size
214
+ end
215
+
216
+ def count_ja_valid_word()
217
+ split_to_word.collect{|word|
218
+ word if Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
219
+ Regexp::MULTILINE).match word.encode('UTF-8')
220
+ }.compact.size
221
+ end
222
+
223
# Splits the string into lines, each keeping its trailing end-of-line
# sequence; a final line without one is kept as well.
#
# The pattern is chosen by the value of eol_char, not by whether the
# method is defined (it always is). With a nil eol_char the pattern
# ".*?|.+" would match the empty string at every position, so the whole
# text is returned as one line instead.
#
# Matching is done on a UTF-8 copy and every line is converted back to
# the string's own encoding.
#
# Returns an Array of Strings.
# Raises RuntimeError when eol is not registered in EOLChars.
def split_to_line()
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  source = eol_char ? ".*?#{eol_char}|.+" : ".+"
  encode('UTF-8').scan(Regexp.new(source, Regexp::MULTILINE)
                      ).map{|e| e.encode(self.encoding)}
end
235
+
236
+ def count_graph_line()
237
+ split_to_line.collect{|line|
238
+ line if Regexp.new("[#{Encodings['UTF-8']::GRAPH}" +
239
+ "#{Encodings['UTF-8']::JA_GRAPH}]",
240
+ Regexp::MULTILINE).match line.encode('UTF-8')
241
+ }.compact.size
242
+ end
243
+
244
+ def count_blank_line()
245
+ split_to_line.collect{|line|
246
+ line if Regexp.new("^[#{Encodings['UTF-8']::BLANK}" +
247
+ "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?",
248
+ Regexp::MULTILINE).match line.encode('UTF-8')
249
+ }.compact.size
250
+ end
251
+
252
+ # load encoding modules
253
+ require 'docdiff/encoding/en_ascii'
254
+ require 'docdiff/encoding/ja_eucjp'
255
+ require 'docdiff/encoding/ja_sjis'
256
+ require 'docdiff/encoding/ja_utf8'
257
+ else
258
+ # for Ruby-1.8
259
+ require 'iconv'
260
+
261
+ def encoding()
262
+ @encoding
263
+ # if @encoding
264
+ # @encoding
265
+ # else
266
+ # @encoding = CharString.guess_encoding(self)
267
+ # # raise "encoding is not set.\n"
268
+ # end
269
+ end
270
+
271
+ def encoding=(cs)
272
+ @encoding = cs
273
+ extend Encodings[@encoding] # ; p "Hey, I extended #{Encodings[@encoding]}!"
274
+ end
275
+
276
+ # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
277
+ def CharString.guess_encoding(string)
278
+ return nil if string == nil
279
+ result_using_pureruby = CharString.guess_encoding_using_pureruby(string)
280
+ result_using_iconv = CharString.guess_encoding_using_iconv(string)
281
+ if result_using_pureruby == result_using_iconv
282
+ result_using_pureruby
283
+ else
284
+ "UNKNOWN"
285
+ end
286
+ end
287
+
288
+ # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
289
+ def CharString.guess_encoding_using_pureruby(string)
290
+ return nil if string == nil
291
+
292
+ ascii_pat = '[\x00-\x7f]'
293
+ jis_pat = ['(?:(?:\x1b\x28\x42)',
294
+ '|(?:\x1b\x28\x4a)',
295
+ '|(?:\x1b\x28\x49)',
296
+ '|(?:\x1b\x24\x40)',
297
+ '|(?:\x1b\x24\x42)',
298
+ '|(?:\x1b\x24\x44))'].join
299
+ eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
300
+ '|(?:[\x20-\x7e])',
301
+ '|(?:\x8e[\xa1-\xdf])',
302
+ '|(?:[\xa1-\xfe][\xa1-\xfe])',
303
+ '|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
304
+ sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
305
+ '|(?:[\x20-\x7e])',
306
+ '|(?:[\xa1-\xdf])',
307
+ '|(?:[\x81-\x9f][\x40-\x7e])',
308
+ '|(?:[\xe0-\xef][\x80-\xfc]))'].join
309
+ utf8_pat = ['(?:(?:[\x00-\x7f])',
310
+ '|(?:[\xc0-\xdf][\x80-\xbf])',
311
+ '|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
312
+ '|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join
313
+
314
+ ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
315
+ jis_escseq_count = string.scan(/#{jis_pat}/on).size
316
+ eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
317
+ sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
318
+ utf8_match_length = string.scan(/#{utf8_pat}/no).join.length
319
+
320
+ case
321
+ when 0 < jis_escseq_count # JIS escape sequense found
322
+ guessed_encoding = 'JIS'
323
+ when ascii_match_length == string.length # every char is ASCII (but not JIS)
324
+ guessed_encoding = 'US-ASCII'
325
+ else
326
+ case
327
+ when eucjp_match_length < (string.length / 2) &&
328
+ sjis_match_length < (string.length / 2) &&
329
+ utf8_match_length < (string.length / 2)
330
+ guessed_encoding = 'UNKNOWN' # either encoding did not match long enough
331
+ when (eucjp_match_length < utf8_match_length) &&
332
+ (sjis_match_length < utf8_match_length)
333
+ guessed_encoding = 'UTF-8'
334
+ when (eucjp_match_length < sjis_match_length) &&
335
+ (utf8_match_length < sjis_match_length)
336
+ guessed_encoding = 'Shift_JIS'
337
+ when (sjis_match_length < eucjp_match_length) &&
338
+ (utf8_match_length < eucjp_match_length)
339
+ guessed_encoding = 'EUC-JP'
340
+ else
341
+ guessed_encoding = 'UNKNOWN' # cannot guess at all
342
+ end
343
+ end
344
+ return guessed_encoding
345
+ end
346
+
347
+ def CharString.guess_encoding_using_iconv(string)
348
+ valid_as_utf8 = CharString.valid_as("utf-8", string)
349
+ valid_as_sjis = CharString.valid_as("cp932", string) # not sjis, but cp932
350
+ valid_as_jis = CharString.valid_as("iso-2022-jp", string)
351
+ valid_as_eucjp = CharString.valid_as("eucjp", string)
352
+ valid_as_ascii = CharString.valid_as("ascii", string)
353
+ invalid_as_utf8 = CharString.invalid_as("utf-8", string)
354
+ invalid_as_sjis = CharString.invalid_as("cp932", string) # not sjis, but cp932
355
+ invalid_as_jis = CharString.invalid_as("iso-2022-jp", string)
356
+ invalid_as_eucjp = CharString.invalid_as("eucjp", string)
357
+ invalid_as_ascii = CharString.invalid_as("ascii", string)
358
+ case
359
+ when string == nil
360
+ nil
361
+ when valid_as_ascii
362
+ "US-ASCII"
363
+ when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
364
+ "JIS"
365
+ when valid_as_eucjp
366
+ "EUC-JP"
367
+ when valid_as_sjis && invalid_as_utf8 && invalid_as_eucjp && invalid_as_jis
368
+ "Shift_JIS"
369
+ when valid_as_utf8 && invalid_as_sjis && invalid_as_eucjp && invalid_as_jis
370
+ "UTF-8"
371
+ else
372
+ "UNKNOWN"
373
+ end
374
+ end
375
+
376
+ def CharString.valid_as(encoding_name, string)
377
+ begin
378
+ Iconv.iconv(encoding_name, encoding_name, string)
379
+ rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
380
+ return false
381
+ else
382
+ return true
383
+ end
384
+ end
385
+
386
# Negation of valid_as.
#
# Returns true when valid_as reports the string as not valid in the named
# encoding, false otherwise.
def self.invalid_as(encoding_name, string)
  !valid_as(encoding_name, string)
end
393
+
394
+ def split_to_byte()
395
+ scan(/./nm)
396
+ end
397
+
398
+ def split_to_char()
399
+ raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
400
+ # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
401
+ if eol_char # sometimes string has no end-of-line char
402
+ scan(Regexp.new("(?:#{eol_char})|(?:.)",
403
+ Regexp::MULTILINE,
404
+ encoding.sub(/ASCII/i, 'none'))
405
+ )
406
+ else # it seems that no EOL module was extended...
407
+ scan(Regexp.new("(?:.)",
408
+ Regexp::MULTILINE,
409
+ encoding.sub(/ASCII/i, 'none'))
410
+ )
411
+ end
412
+ end
413
+
414
+ def count_latin_graph_char()
415
+ raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
416
+ # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
417
+ scan(Regexp.new("[#{Encodings[encoding]::GRAPH}]",
418
+ Regexp::MULTILINE,
419
+ encoding.sub(/ASCII/i, 'none'))
420
+ ).size
421
+ end
422
+
423
+ def count_ja_graph_char()
424
+ raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
425
+ # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
426
+ scan(Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
427
+ Regexp::MULTILINE,
428
+ encoding.sub(/ASCII/i, 'none'))
429
+ ).size
430
+ end
431
+
432
+ def count_latin_blank_char()
433
+ scan(Regexp.new("[#{Encodings[encoding]::BLANK}]",
434
+ Regexp::MULTILINE,
435
+ encoding.sub(/ASCII/i, 'none'))
436
+ ).size
437
+ end
438
+
439
+ def count_ja_blank_char()
440
+ scan(Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
441
+ Regexp::MULTILINE,
442
+ encoding.sub(/ASCII/i, 'none'))
443
+ ).size
444
+ end
445
+
446
+ def split_to_word()
447
+ raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
448
+ # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
449
+ scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
450
+ Regexp::MULTILINE,
451
+ encoding.sub(/ASCII/i, 'none'))
452
+ )
453
+ end
454
+
455
+ def count_latin_word()
456
+ split_to_word.collect{|word|
457
+ word if Regexp.new("[#{Encodings[encoding]::PRINT}]",
458
+ Regexp::MULTILINE,
459
+ encoding.sub(/ASCII/i, 'none')).match word
460
+ }.compact.size
461
+ end
462
+
463
+ def count_ja_word()
464
+ split_to_word.collect{|word|
465
+ word if Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
466
+ Regexp::MULTILINE,
467
+ encoding.sub(/ASCII/i, 'none')).match word
468
+ }.compact.size
469
+ end
470
+
471
+ def count_latin_valid_word()
472
+ split_to_word.collect{|word|
473
+ word if Regexp.new("[#{Encodings[encoding]::ALNUM}]",
474
+ Regexp::MULTILINE,
475
+ encoding.sub(/ASCII/i, 'none')).match word
476
+ }.compact.size
477
+ end
478
+
479
+ def count_ja_valid_word()
480
+ split_to_word.collect{|word|
481
+ word if Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
482
+ Regexp::MULTILINE,
483
+ encoding.sub(/ASCII/i, 'none')).match word
484
+ }.compact.size
485
+ end
486
+
487
# Splits the string into lines, each keeping its trailing end-of-line
# sequence; a final line without one is kept as well.
#
# The pattern is chosen by the value of eol_char, not by whether the
# method is defined (it always is). With a nil eol_char the pattern
# ".*?|.+" would match the empty string at every position, so the whole
# text is returned as one line instead.
#
# Returns an Array of Strings.
# Raises RuntimeError when encoding is not registered in Encodings or eol
# is not registered in EOLChars.
def split_to_line()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  source = eol_char ? ".*?#{eol_char}|.+" : ".+"
  scan(Regexp.new(source,
                  Regexp::MULTILINE,
                  encoding.sub(/ASCII/i, 'none'))
      )
end
506
+
507
+ def count_graph_line()
508
+ split_to_line.collect{|line|
509
+ line if Regexp.new("[#{Encodings[encoding]::GRAPH}" +
510
+ "#{Encodings[encoding]::JA_GRAPH}]",
511
+ Regexp::MULTILINE,
512
+ encoding.sub(/ASCII/, 'none')).match line
513
+ }.compact.size
514
+ end
515
+
516
+ def count_blank_line()
517
+ split_to_line.collect{|line|
518
+ line if Regexp.new("^[#{Encodings[encoding]::BLANK}" +
519
+ "#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
520
+ Regexp::MULTILINE,
521
+ encoding.sub(/ASCII/, 'none')).match line
522
+ }.compact.size
523
+ end
524
+
525
+ # load encoding modules
526
+ require 'docdiff/encoding/en_ascii'
527
+ require 'docdiff/encoding/ja_eucjp'
528
+ require 'docdiff/encoding/ja_sjis'
529
+ require 'docdiff/encoding/ja_utf8'
530
+ end # end ruby_m17n?
531
+ alias to_bytes split_to_byte
532
+ alias to_chars split_to_char
533
+ alias to_words split_to_word
534
+ alias to_lines split_to_line
535
+
536
+ module CR
537
+ EOL = 'CR'
538
+
539
+ def eol_char()
540
+ "\r"
541
+ end
542
+
543
+ CharString.register_eol(self)
544
+ end
545
+
546
+ module LF
547
+ EOL = 'LF'
548
+
549
+ def eol_char()
550
+ "\n"
551
+ end
552
+
553
+ CharString.register_eol(self)
554
+ end
555
+
556
+ module CRLF
557
+ EOL = 'CRLF'
558
+
559
+ def eol_char()
560
+ "\r\n"
561
+ end
562
+
563
+ CharString.register_eol(self)
564
+ end
565
+
566
+ module NoEOL
567
+ EOL = 'NONE'
568
+ def eol_char()
569
+ nil
570
+ end
571
+
572
+ CharString.register_eol(self)
573
+ end
574
+
575
+ end # module CharString
576
+
577
+ # class String
578
+ # include CharString
579
+ # end