docdiff 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. data/.gitignore +6 -0
  2. data/.travis.yml +7 -0
  3. data/Gemfile +17 -0
  4. data/Guardfile +8 -0
  5. data/Makefile +108 -0
  6. data/Rakefile +17 -0
  7. data/bin/docdiff +179 -0
  8. data/devutil/JIS0208.TXT +6952 -0
  9. data/devutil/char_by_charclass.rb +23 -0
  10. data/devutil/charclass_by_char.rb +21 -0
  11. data/devutil/jis0208.rb +343 -0
  12. data/devutil/testjis0208.rb +38 -0
  13. data/docdiff.conf.example +22 -0
  14. data/docdiff.gemspec +23 -0
  15. data/docdiffwebui.cgi +176 -0
  16. data/docdiffwebui.html +123 -0
  17. data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
  18. data/img/docdiff-screenshot-format-html-firefox.png +0 -0
  19. data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
  20. data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
  21. data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
  22. data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
  23. data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
  24. data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
  25. data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
  26. data/index.html +181 -0
  27. data/langfilter.rb +14 -0
  28. data/lib/doc_diff.rb +170 -0
  29. data/lib/docdiff.rb +7 -0
  30. data/lib/docdiff/charstring.rb +579 -0
  31. data/lib/docdiff/diff.rb +217 -0
  32. data/lib/docdiff/diff/contours.rb +382 -0
  33. data/lib/docdiff/diff/editscript.rb +148 -0
  34. data/lib/docdiff/diff/rcsdiff.rb +107 -0
  35. data/lib/docdiff/diff/shortestpath.rb +93 -0
  36. data/lib/docdiff/diff/speculative.rb +40 -0
  37. data/lib/docdiff/diff/subsequence.rb +39 -0
  38. data/lib/docdiff/diff/unidiff.rb +124 -0
  39. data/lib/docdiff/difference.rb +92 -0
  40. data/lib/docdiff/document.rb +127 -0
  41. data/lib/docdiff/encoding/en_ascii.rb +97 -0
  42. data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
  43. data/lib/docdiff/encoding/ja_sjis.rb +260 -0
  44. data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
  45. data/lib/docdiff/version.rb +3 -0
  46. data/lib/docdiff/view.rb +476 -0
  47. data/lib/viewdiff.rb +375 -0
  48. data/readme.html +713 -0
  49. data/sample/01.en.ascii.cr +1 -0
  50. data/sample/01.en.ascii.crlf +2 -0
  51. data/sample/01.en.ascii.lf +2 -0
  52. data/sample/01.ja.eucjp.lf +2 -0
  53. data/sample/01.ja.sjis.cr +1 -0
  54. data/sample/01.ja.sjis.crlf +2 -0
  55. data/sample/01.ja.utf8.crlf +2 -0
  56. data/sample/02.en.ascii.cr +1 -0
  57. data/sample/02.en.ascii.crlf +2 -0
  58. data/sample/02.en.ascii.lf +2 -0
  59. data/sample/02.ja.eucjp.lf +2 -0
  60. data/sample/02.ja.sjis.cr +1 -0
  61. data/sample/02.ja.sjis.crlf +2 -0
  62. data/sample/02.ja.utf8.crlf +2 -0
  63. data/sample/humpty_dumpty01.ascii.lf +4 -0
  64. data/sample/humpty_dumpty02.ascii.lf +4 -0
  65. data/test/charstring_test.rb +1008 -0
  66. data/test/diff_test.rb +36 -0
  67. data/test/difference_test.rb +64 -0
  68. data/test/docdiff_test.rb +193 -0
  69. data/test/document_test.rb +626 -0
  70. data/test/test_helper.rb +7 -0
  71. data/test/view_test.rb +570 -0
  72. data/test/viewdiff_test.rb +908 -0
  73. metadata +129 -0
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/ruby
2
+ # language filter
3
+ # usage: langfilter.rb --en <infile >outfile
4
+
5
# Tells whether this interpreter has per-string encodings.
# Returns true when String#encoding is available, otherwise nil.
def ruby_m17n?
  "".respond_to?(:encoding) ? true : nil
end
8
+
9
# Main script: drop every element marked (via a lang= or title= attribute)
# with the language opposite to the one requested on the command line,
# reading from ARGF and writing the filtered text to standard output.
#
# The first argument selects the language to keep ("--en" or "--ja"; the
# dashes are optional). A missing or unsupported language aborts with a
# usage message instead of failing later with a NoMethodError on nil or
# silently filtering on an empty language name.
LANGFILTER_USAGE = "usage: langfilter.rb --en|--ja <infile >outfile"
LANGFILTER_OPPOSITE = {"en"=>"ja", "ja"=>"en"}

lang_to_include = ARGV.shift.to_s.gsub(/-+/, "")
lang_to_exclude = LANGFILTER_OPPOSITE[lang_to_include]
abort LANGFILTER_USAGE if lang_to_exclude.nil?

# \1 makes the closing tag match the element name captured at the start.
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m

ARGF.set_encoding("UTF-8") if ruby_m17n?
ARGF.read.gsub(re, "").display
@@ -0,0 +1,170 @@
1
+ # DocDiff: word/character-oriented text comparison utility
2
+ # Copyright (C) 2002-2011 Hisashi MORITA
3
+ # Requirements: Ruby (>= 1.8)
4
# DocDiff compares two Document objects at line, word, or character
# resolution and renders the resulting Difference through a View.
# Settings loaded from config files are kept in the +config+ hash.
class DocDiff

  AppVersion = Docdiff::VERSION
  Author = "Copyright (C) 2002-2011 Hisashi MORITA.\n" +
           "diff library originates from Ruby/CVS by TANAKA Akira.\n"
  License = "This software is licensed under so-called modified BSD license.\n" +
            "See the document for detail.\n"
  SystemConfigFileName = File.join(File::Separator, "etc", "docdiff", "docdiff.conf")
  UserConfigFileName = File.join(ENV['HOME'], "etc", "docdiff", "docdiff.conf")
  AltUserConfigFileName = File.join(ENV['HOME'], ".docdiff", "docdiff.conf")

  def initialize()
    @config = {}
  end
  attr_accessor :config

  # Parses config text made of "name = value" lines into a Hash keyed by
  # Symbol. Everything from '#' to the end of a line is dropped, and blank
  # lines are skipped.
  #
  # Value conversion: on/yes/true => true, off/no/false => false (case
  # insensitive), all-digit strings => Integer, anything else stays a String.
  #
  # Only the first " = " separates name from value, so a value that itself
  # contains " = " is kept whole.
  #
  # Returns an empty Hash for nil or empty content.
  # Raises RuntimeError when a line has no " = " or the name has whitespace.
  def self.parse_config_file_content(content)
    result = {}
    return result if content.nil? || content.empty?
    content.split(/\r\n|\r|\n/).each{|raw_line|
      line = raw_line.sub(/#.*$/, '').strip
      next if line == ""
      raise 'line does not include " = ".' unless /[\s]+=[\s]+/.match line
      name_src, value_src = line.split(/[\s]+=[\s]+/, 2)
      raise "Invalid name: #{name_src.inspect}" if (/\s/.match name_src)
      raise "Invalid value: #{value_src.inspect}" unless value_src.kind_of?(String)
      value = case value_src.downcase
              when 'on', 'yes', 'true' then true
              when 'off', 'no', 'false' then false
              else value_src
              end
      value = value_src.to_i if /^[0-9]+$/.match value_src
      result[name_src.intern] = value
    }
    result
  end

  # Line-level comparison of the two documents.
  def compare_by_line(doc1, doc2)
    Difference.new(doc1.split_to_line, doc2.split_to_line)
  end

  # Line-level comparison, with every changed region re-compared by word.
  def compare_by_line_word(doc1, doc2)
    refine_changes(compare_by_line(doc1, doc2), doc1, doc2){|doc|
      doc.split_to_word
    }
  end

  # Line- and word-level comparison, with every region that is still marked
  # as changed re-compared by character.
  def compare_by_line_word_char(doc1, doc2)
    refine_changes(compare_by_line_word(doc1, doc2), doc1, doc2){|doc|
      doc.split_to_char
    }
  end

  # Compares doc1 with doc2 and returns the rendered result as one String.
  #
  # option[:resolution] - "line", "word" or "char"
  # option[:format]     - "tty", "html", "manued", "wdiff", "stat" or "user"
  # option[:digest]     - truthy selects the digest variant of the format;
  #                       nil is treated like false
  #
  # Raises RuntimeError when option, its :resolution or its :format is nil,
  # or when the resolution or format is not supported.
  def run(doc1, doc2, option)
    raise "option is nil" if option.nil?
    raise "option[:resolution] is nil" if option[:resolution].nil?
    raise "option[:format] is nil" if option[:format].nil?
    # The input is accepted as soon as any ONE of these conditions holds.
    unless (doc1.class == Document && doc2.class == Document) ||
           (doc1.encoding != nil && doc2.encoding != nil) ||
           (doc1.encoding == doc2.encoding && doc1.eol == doc2.eol)
      raise("Error! Blame the author (doc1: #{doc1.encoding}, #{doc1.eol}, doc2: #{doc2.encoding}, #{doc2.eol}).")
    end

    difference = case option[:resolution]
                 when "line" then compare_by_line(doc1, doc2)
                 when "word" then compare_by_line_word(doc1, doc2)
                 when "char" then compare_by_line_word_char(doc1, doc2)
                 else
                   raise "Unsupported resolution: #{option[:resolution].inspect}"
                 end
    view = View.new(difference, doc1.encoding, doc1.eol)
    # ||= also stores the empty default back into @config.
    user_tags = {:start_common        => (@config[:tag_common_start] ||= ''),
                 :end_common          => (@config[:tag_common_end] ||= ''),
                 :start_del           => (@config[:tag_del_start] ||= ''),
                 :end_del             => (@config[:tag_del_end] ||= ''),
                 :start_add           => (@config[:tag_add_start] ||= ''),
                 :end_add             => (@config[:tag_add_end] ||= ''),
                 :start_before_change => (@config[:tag_change_before_start] ||= ''),
                 :end_before_change   => (@config[:tag_change_before_end] ||= ''),
                 :start_after_change  => (@config[:tag_change_after_start] ||= ''),
                 :end_after_change    => (@config[:tag_change_after_end] ||= '')}
    digest = option[:digest]
    result = case option[:format]
             when "tty"    then digest ? view.to_tty_digest(option)     : view.to_tty(option)
             when "html"   then digest ? view.to_html_digest(option)    : view.to_html(option)
             when "manued" then digest ? view.to_manued_digest(option)  : view.to_manued(option)
             when "wdiff"  then digest ? view.to_wdiff_digest(option)   : view.to_wdiff(option)
             when "stat"   then view.to_stat(option)
             when "user"   then digest ? view.to_user_digest(user_tags) : view.to_user(user_tags)
             else
               raise "Unsupported output format: #{option[:format].inspect}."
             end
    result.join
  end

  # Reads +filename+ and merges its settings into +config+.
  # Returns nil on success, or a message String when the file does not
  # exist. Other I/O errors and parse errors propagate to the caller.
  def process_config_file(filename)
    begin
      file_content = File.open(filename, "r"){|f| f.read}
    rescue Errno::ENOENT
      return "config file not found so not read."
    end
    unless file_content.nil?
      self.config.update(DocDiff.parse_config_file_content(file_content))
    end
    nil
  end

  # Copies +coarse+ into a new Difference, replacing each :change_elt entry
  # by a finer comparison of its before/after text. The block receives a
  # Document and returns the units (words, chars) to compare.
  def refine_changes(coarse, doc1, doc2)
    refined = Difference.new
    coarse.each{|elt|
      if elt.first == :change_elt
        before_change = Document.new(elt[1].join, doc1.encoding, doc1.eol)
        after_change = Document.new(elt[2].join, doc2.encoding, doc2.eol)
        Difference.new(yield(before_change), yield(after_change)).each{|finer|
          refined << finer
        }
      else # :common_elt_elt, :del_elt, or :add_elt
        refined << elt
      end
    }
    refined
  end
  private :refine_changes

end # class DocDiff
@@ -0,0 +1,7 @@
1
+ # DocDiff: word/character-oriented text comparison utility
2
+ # Copyright (C) 2002-2011 Hisashi MORITA
3
+ # Requirements: Ruby (>= 1.8)
4
+ require 'docdiff/version'
5
+ require 'doc_diff'
6
# Top-level namespace of the gem; intentionally left without a body here.
module Docdiff; end
@@ -0,0 +1,579 @@
1
+ #!/usr/bin/ruby
2
+ # Character String module.
3
+ # To use, include to String, or extend String.
4
+ # 2003- Hisashi MORITA
5
+
6
+ module CharString
7
+
8
+ Encodings = {}
9
+ EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
10
+
11
# Hands construction straight to the including class.
# No encoding or end-of-line guessing is done at this point.
def initialize(string)
  super
end
18
+
19
# Name of the end-of-line convention last assigned with eol=, or nil.
def eol
  @eol
end

# Stores the end-of-line name and extends the receiver with the module
# registered under that name in EOLChars.
def eol=(e)
  @eol = e
  extend(EOLChars[@eol])
end

# Default end-of-line string: nil unless @eol_char holds a truthy value.
# Modules from EOLChars override this once eol= has extended the receiver.
def eol_char
  @eol_char ? @eol_char : nil
end
43
+
44
# Returns a one-line description of the receiver (object id, class, content
# and the encoding / end-of-line modules registered for it).
#
# Raises RuntimeError when the encoding or end-of-line is unset or has no
# registered module.
#
# The encoding is read through the +encoding+ accessor rather than the
# @encoding instance variable, and the identifier through +object_id+
# rather than the long-removed Object#id.
def debug()
  enc = encoding
  case
  when enc == nil
    raise "@encoding is nil."
  when Encodings[enc] == nil
    raise "Encodings[@encoding(=#{enc})] is nil."
  when Encodings[enc].class != Module
    raise "Encodings[@encoding].class(=#{Encodings[enc].class}) is not a module."
  when @eol == nil
    raise "@eol is nil."
  when EOLChars[@eol] == nil
    raise "EOLChars[@eol(=#{@eol})] is nil."
  else
    # should I do some alert?
  end
  ["id: #{self.object_id}, class: #{self.class}, self: #{self}, ",
   "module: #{Encodings[enc]}, #{EOLChars[@eol]}"].join
end
62
+
63
# Files +mod+ in the Encodings table under its own Encoding constant.
def self.register_encoding(mod)
  Encodings.store(mod::Encoding, mod)
end

# Files +mod+ in the EOLChars table under its own EOL constant.
def self.register_eol(mod)
  EOLChars.store(mod::EOL, mod)
end
70
+
71
# Guesses the end-of-line convention used in +string+.
#
# Returns nil for a nil argument, 'NONE' when no line ending occurs
# (possibly a one-line text), 'CR', 'LF' or 'CRLF' when exactly one kind
# occurs, and 'UNKNOWN' when several kinds are mixed (possibly binary data).
def self.guess_eol(string)
  return nil if string == nil
  kinds_found = []
  kinds_found << 'CR' unless string.scan(/(\r)(?!\n)/o).empty?
  kinds_found << 'LF' unless string.scan(/(?:\A|[^\r])(\n)/o).empty?
  kinds_found << 'CRLF' unless string.scan(/(\r\n)/o).empty?
  case kinds_found.size
  when 0 then 'NONE'
  when 1 then kinds_found.first
  else 'UNKNOWN'
  end
end
89
+
90
# True when strings on this interpreter respond to force_encoding
# (i.e. the Ruby 1.9-style string encoding API is present).
def self.ruby_m17n?
  String.new.respond_to?(:force_encoding)
end
93
+
94
+ # Note that some languages (like Japanese) do not have 'word' or 'phrase',
95
+ # thus some of the following methods are not 'linguistically correct'.
96
+
97
# Number of bytes, as produced by split_to_byte.
def count_byte
  split_to_byte.length
end

# Number of characters, as produced by split_to_char
# (an end-of-line sequence counts as one character).
def count_char
  split_to_char.length
end

# Latin plus Japanese graphic characters.
def count_graph_char
  latin = count_latin_graph_char
  japanese = count_ja_graph_char
  latin + japanese
end

# Latin plus Japanese blank characters.
def count_blank_char
  latin = count_latin_blank_char
  japanese = count_ja_blank_char
  latin + japanese
end

# Number of words, as produced by split_to_word.
def count_word
  split_to_word.length
end

# Latin plus Japanese valid words.
def count_valid_word
  latin = count_latin_valid_word
  japanese = count_ja_valid_word
  latin + japanese
end

# Number of lines, as produced by split_to_line (same for all encodings).
def count_line
  split_to_line.length
end
124
+
125
# Counts the lines from split_to_line that are empty: either nothing at
# all or just the end-of-line sequence.
#
# When no end-of-line string is available (eol_char is nil) only truly
# empty lines are counted. Interpolating nil into the pattern would give
# an empty alternative that matches every line.
def count_empty_line()
  lines = split_to_line
  terminator = eol_char
  empty_pattern = terminator ? /^(?:#{terminator})|^$/m : /^$/m
  lines.select{|line| empty_pattern.match line}.size
end
130
+
131
+ if ruby_m17n?
132
+ # for Ruby-1.9
133
# Name of the receiver's encoding as a String (e.g. "UTF-8").
# A plain String copy is consulted so the built-in accessor is used.
def encoding
  plain_copy = String.new(self)
  plain_copy.encoding.to_s
end

# Relabels the receiver's bytes as encoding +cs+ (no transcoding).
def encoding=(cs)
  return nil unless self
  force_encoding(cs)
end

# Encoding name of +string+ as a String, or nil when +string+ is nil/false.
def self.guess_encoding(string)
  return nil unless string
  string.encoding.to_s
end
148
+
149
# Splits the receiver into an Array of one-byte binary Strings.
#
# The bytes are relabelled as ASCII-8BIT on a copy instead of being
# transcoded: String#encode to ASCII-8BIT raises
# Encoding::UndefinedConversionError for any non-ASCII character.
def split_to_byte()
  String.new(self).force_encoding("ASCII-8BIT").scan(/./nm)
end
152
+
153
# Splits the receiver into characters, keeping an end-of-line sequence
# together as one element when eol_char is set. Matching is done on a
# UTF-8 copy and each piece is converted back to the receiver's encoding.
def split_to_char
  # Without an EOL module the string is simply cut into single characters.
  char_src = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  pieces = encode('UTF-8').scan(Regexp.new(char_src, Regexp::MULTILINE))
  pieces.map{|piece| piece.encode(self.encoding)}
end
164
+
165
# Counts characters in the GRAPH class of the UTF-8 encoding module.
def count_latin_graph_char
  encode('UTF-8').scan(/[#{Encodings['UTF-8']::GRAPH}]/m).length
end

# Counts characters in the JA_GRAPH class of the UTF-8 encoding module.
def count_ja_graph_char
  encode('UTF-8').scan(/[#{Encodings['UTF-8']::JA_GRAPH}]/m).length
end

# Counts characters in the BLANK class of the UTF-8 encoding module.
def count_latin_blank_char
  encode('UTF-8').scan(/[#{Encodings['UTF-8']::BLANK}]/m).length
end

# Counts characters in the JA_BLANK class of the UTF-8 encoding module.
def count_ja_blank_char
  encode('UTF-8').scan(/[#{Encodings['UTF-8']::JA_BLANK}]/m).length
end
188
+
189
# Splits the receiver into words using WORD_REGEXP_SRC of the UTF-8
# encoding module. Matching is done on a UTF-8 copy and each word is
# converted back to the receiver's encoding.
def split_to_word
  word_pattern = Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC, Regexp::MULTILINE)
  words = encode('UTF-8').scan(word_pattern)
  words.map{|word| word.encode(self.encoding)}
end
194
+
195
# Counts words containing at least one PRINT character.
def count_latin_word
  words = split_to_word
  printable = Regexp.new("[#{Encodings['UTF-8']::PRINT}]", Regexp::MULTILINE)
  words.select{|word| printable.match(word.encode('UTF-8'))}.size
end

# Counts words containing at least one JA_PRINT character.
def count_ja_word
  words = split_to_word
  printable = Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]", Regexp::MULTILINE)
  words.select{|word| printable.match(word.encode('UTF-8'))}.size
end

# Counts words containing at least one ALNUM character.
def count_latin_valid_word
  words = split_to_word
  alnum = Regexp.new("[#{Encodings['UTF-8']::ALNUM}]", Regexp::MULTILINE)
  words.select{|word| alnum.match(word.encode('UTF-8'))}.size
end

# Counts words containing at least one JA_GRAPH character.
def count_ja_valid_word
  words = split_to_word
  graphic = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  words.select{|word| graphic.match(word.encode('UTF-8'))}.size
end
222
+
223
# Splits the receiver into lines, each keeping its trailing end-of-line
# sequence. Matching is done on a UTF-8 copy and each line is converted
# back to the receiver's encoding.
#
# Raises RuntimeError when no module is registered in EOLChars for +eol+.
#
# The branch is chosen by the VALUE of eol_char. The method always exists,
# so a `defined?` test can never select the no-EOL pattern. With a nil
# eol_char the terminator-based pattern would collapse to ".*?|.+" and
# yield only empty strings.
def split_to_line()
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  line_src = eol_char ? ".*?#{eol_char}|.+" : ".+"
  encode('UTF-8').scan(Regexp.new(line_src, Regexp::MULTILINE)
                      ).map{|e| e.encode(self.encoding)}
end
235
+
236
# Counts lines containing at least one GRAPH or JA_GRAPH character.
def count_graph_line
  lines = split_to_line
  graphic = Regexp.new("[#{Encodings['UTF-8']::GRAPH}" +
                       "#{Encodings['UTF-8']::JA_GRAPH}]",
                       Regexp::MULTILINE)
  lines.select{|line| graphic.match(line.encode('UTF-8'))}.size
end
243
+
244
# Counts lines made up ONLY of blank characters (BLANK / JA_BLANK),
# optionally followed by the end-of-line sequence.
#
# The pattern is anchored at both ends of the line. Without the end
# anchor, any line that merely begins with whitespace (for example an
# indented line of text) would be counted as blank.
def count_blank_line()
  lines = split_to_line
  blank_only = Regexp.new("\\A[#{Encodings['UTF-8']::BLANK}" +
                          "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?\\z",
                          Regexp::MULTILINE)
  lines.select{|line| blank_only.match(line.encode('UTF-8'))}.size
end
251
+
252
+ # load encoding modules
253
+ require 'docdiff/encoding/en_ascii'
254
+ require 'docdiff/encoding/ja_eucjp'
255
+ require 'docdiff/encoding/ja_sjis'
256
+ require 'docdiff/encoding/ja_utf8'
257
+ else
258
+ # for Ruby-1.8
259
+ require 'iconv'
260
+
261
# Encoding name last assigned with encoding=, or nil when none was set.
def encoding
  @encoding
end

# Stores the encoding name and extends the receiver with the module
# registered under that name in Encodings.
def encoding=(cs)
  @encoding = cs
  extend(Encodings[@encoding])
end
275
+
276
+ # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
277
# Guesses the encoding of +string+ with both the pure-Ruby and the Iconv
# based detectors. Returns their common answer when they agree, "UNKNOWN"
# when they disagree, and nil for a nil argument.
def self.guess_encoding(string)
  return nil if string == nil
  by_pureruby = guess_encoding_using_pureruby(string)
  by_iconv = guess_encoding_using_iconv(string)
  by_pureruby == by_iconv ? by_pureruby : "UNKNOWN"
end
287
+
288
+ # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
289
# Guesses the encoding of +string+ by byte-pattern matching only.
#
# Returns nil for nil input, 'JIS' when a JIS escape sequence is present,
# 'US-ASCII' when every byte is ASCII, otherwise whichever of 'UTF-8',
# 'Shift_JIS' and 'EUC-JP' covers strictly more of the string than the
# other two. Returns 'UNKNOWN' when none of the three covers at least half
# of the string, or when the best coverage is tied.
def self.guess_encoding_using_pureruby(string)
  return nil if string == nil

  ascii_pat = '[\x00-\x7f]'
  jis_pat = ['(?:(?:\x1b\x28\x42)',
             '|(?:\x1b\x28\x4a)',
             '|(?:\x1b\x28\x49)',
             '|(?:\x1b\x24\x40)',
             '|(?:\x1b\x24\x42)',
             '|(?:\x1b\x24\x44))'].join
  eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
               '|(?:[\x20-\x7e])',
               '|(?:\x8e[\xa1-\xdf])',
               '|(?:[\xa1-\xfe][\xa1-\xfe])',
               '|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
  sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
              '|(?:[\x20-\x7e])',
              '|(?:[\xa1-\xdf])',
              '|(?:[\x81-\x9f][\x40-\x7e])',
              '|(?:[\xe0-\xef][\x80-\xfc]))'].join
  utf8_pat = ['(?:(?:[\x00-\x7f])',
              '|(?:[\xc0-\xdf][\x80-\xbf])',
              '|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
              '|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join

  ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
  jis_escseq_count = string.scan(/#{jis_pat}/on).size
  eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
  sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
  utf8_match_length = string.scan(/#{utf8_pat}/no).join.length

  # JIS escape sequence found
  return 'JIS' if 0 < jis_escseq_count
  # every char is ASCII (but not JIS)
  return 'US-ASCII' if ascii_match_length == string.length

  coverage = [['UTF-8', utf8_match_length],
              ['Shift_JIS', sjis_match_length],
              ['EUC-JP', eucjp_match_length]]
  half_length = string.length / 2
  # no candidate matched long enough
  return 'UNKNOWN' if coverage.all?{|name, len| len < half_length}

  longest = coverage.collect{|name, len| len}.max
  winners = coverage.select{|name, len| len == longest}
  # a tie for the longest match means we cannot guess at all
  winners.size == 1 ? winners.first.first : 'UNKNOWN'
end
346
+
347
# Guesses the encoding of +string+ by checking which encodings accept it
# (see valid_as).
#
# Returns nil for nil input, otherwise "US-ASCII", "JIS", "EUC-JP",
# "Shift_JIS", "UTF-8" or "UNKNOWN". ASCII, JIS and EUC-JP win as soon as
# the string is valid in them, in that order. Shift_JIS and UTF-8 are
# reported only when the string is valid in none of the other multibyte
# candidates.
#
# The nil check comes first so nil input never reaches the converter, and
# each encoding is validated once, with "invalid" taken as the negation.
def self.guess_encoding_using_iconv(string)
  return nil if string == nil
  valid_as_utf8 = valid_as("utf-8", string)
  valid_as_sjis = valid_as("cp932", string) # not sjis, but cp932
  valid_as_jis = valid_as("iso-2022-jp", string)
  valid_as_eucjp = valid_as("eucjp", string)
  valid_as_ascii = valid_as("ascii", string)
  case
  when valid_as_ascii
    "US-ASCII"
  when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
    "JIS"
  when valid_as_eucjp
    "EUC-JP"
  when valid_as_sjis && !valid_as_utf8 && !valid_as_eucjp && !valid_as_jis
    "Shift_JIS"
  when valid_as_utf8 && !valid_as_sjis && !valid_as_eucjp && !valid_as_jis
    "UTF-8"
  else
    "UNKNOWN"
  end
end
375
+
376
# True when Iconv can convert +string+ from +encoding_name+ to itself
# without an illegal-sequence, invalid-character or out-of-range error;
# false when one of those errors is raised.
def self.valid_as(encoding_name, string)
  Iconv.iconv(encoding_name, encoding_name, string)
  true
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
  false
end
385
+
386
# Negation of valid_as: true when +string+ is not valid in +encoding_name+.
def self.invalid_as(encoding_name, string)
  valid_as(encoding_name, string) ? false : true
end
393
+
394
# Splits the receiver into an Array of single bytes, using a
# byte-oriented (/n) pattern in which '.' also matches newlines.
def split_to_byte
  scan(/./mn)
end
397
+
398
# Splits the receiver into characters of its own encoding, keeping an
# end-of-line sequence together as one element when eol_char is set.
#
# Raises RuntimeError when no module is registered in Encodings for
# +encoding+.
def split_to_char
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  # Without an EOL module the string is simply cut into single characters.
  char_src = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  scan(Regexp.new(char_src,
                  Regexp::MULTILINE,
                  encoding.sub(/ASCII/i, 'none')))
end
413
+
414
# Counts characters in the GRAPH class of the receiver's encoding module.
# Raises RuntimeError when no module is registered for +encoding+.
def count_latin_graph_char
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  graph_pattern = Regexp.new("[#{Encodings[encoding]::GRAPH}]",
                             Regexp::MULTILINE,
                             encoding.sub(/ASCII/i, 'none'))
  scan(graph_pattern).size
end

# Counts characters in the JA_GRAPH class of the receiver's encoding module.
# Raises RuntimeError when no module is registered for +encoding+.
def count_ja_graph_char
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  graph_pattern = Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
                             Regexp::MULTILINE,
                             encoding.sub(/ASCII/i, 'none'))
  scan(graph_pattern).size
end

# Counts characters in the BLANK class of the receiver's encoding module.
def count_latin_blank_char
  blank_pattern = Regexp.new("[#{Encodings[encoding]::BLANK}]",
                             Regexp::MULTILINE,
                             encoding.sub(/ASCII/i, 'none'))
  scan(blank_pattern).size
end

# Counts characters in the JA_BLANK class of the receiver's encoding module.
def count_ja_blank_char
  blank_pattern = Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
                             Regexp::MULTILINE,
                             encoding.sub(/ASCII/i, 'none'))
  scan(blank_pattern).size
end
445
+
446
# Splits the receiver into words using WORD_REGEXP_SRC of the receiver's
# encoding module.
# Raises RuntimeError when no module is registered for +encoding+.
def split_to_word
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  word_pattern = Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
                            Regexp::MULTILINE,
                            encoding.sub(/ASCII/i, 'none'))
  scan(word_pattern)
end
454
+
455
# Counts words containing at least one PRINT character.
def count_latin_word
  split_to_word.select{|word|
    Regexp.new("[#{Encodings[encoding]::PRINT}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match word
  }.size
end

# Counts words containing at least one JA_PRINT character.
def count_ja_word
  split_to_word.select{|word|
    Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match word
  }.size
end

# Counts words containing at least one ALNUM character.
def count_latin_valid_word
  split_to_word.select{|word|
    Regexp.new("[#{Encodings[encoding]::ALNUM}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match word
  }.size
end

# Counts words containing at least one JA_GRAPH character.
def count_ja_valid_word
  split_to_word.select{|word|
    Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match word
  }.size
end
486
+
487
# Splits the receiver into lines, each keeping its trailing end-of-line
# sequence.
#
# Raises RuntimeError when no module is registered in Encodings for
# +encoding+ or in EOLChars for +eol+.
#
# The branch is chosen by the VALUE of eol_char. The method always exists,
# so a `defined?` test can never select the no-EOL pattern. With a nil
# eol_char the terminator-based pattern would collapse to ".*?|.+" and
# yield only empty strings.
def split_to_line()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  line_src = eol_char ? ".*?#{eol_char}|.+" : ".+"
  scan(Regexp.new(line_src,
                  Regexp::MULTILINE,
                  encoding.sub(/ASCII/i, 'none')))
end
506
+
507
# Counts lines containing at least one GRAPH or JA_GRAPH character.
def count_graph_line()
  split_to_line.select{|line|
    Regexp.new("[#{Encodings[encoding]::GRAPH}" +
               "#{Encodings[encoding]::JA_GRAPH}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/, 'none')).match line
  }.size
end

# Counts lines made up ONLY of blank characters (BLANK / JA_BLANK),
# optionally followed by the end-of-line sequence.
#
# The pattern is anchored at both ends of the line. Without the end
# anchor, any line that merely begins with whitespace (for example an
# indented line of text) would be counted as blank.
def count_blank_line()
  split_to_line.select{|line|
    Regexp.new("\\A[#{Encodings[encoding]::BLANK}" +
               "#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?\\z",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/, 'none')).match line
  }.size
end
524
+
525
+ # load encoding modules
526
+ require 'docdiff/encoding/en_ascii'
527
+ require 'docdiff/encoding/ja_eucjp'
528
+ require 'docdiff/encoding/ja_sjis'
529
+ require 'docdiff/encoding/ja_utf8'
530
+ end # end ruby_m17n?
531
+ alias to_bytes split_to_byte
532
+ alias to_chars split_to_char
533
+ alias to_words split_to_word
534
+ alias to_lines split_to_line
535
+
536
# End-of-line flavour: carriage return. Registered under 'CR'.
module CR
  EOL = 'CR'

  # The line terminator string for this flavour.
  def eol_char; "\r"; end

  CharString.register_eol(self)
end

# End-of-line flavour: line feed. Registered under 'LF'.
module LF
  EOL = 'LF'

  # The line terminator string for this flavour.
  def eol_char; "\n"; end

  CharString.register_eol(self)
end

# End-of-line flavour: carriage return + line feed. Registered under 'CRLF'.
module CRLF
  EOL = 'CRLF'

  # The line terminator string for this flavour.
  def eol_char; "\r\n"; end

  CharString.register_eol(self)
end

# Flavour for text without any line terminator. Registered under 'NONE'.
module NoEOL
  EOL = 'NONE'

  # There is no terminator, hence nil.
  def eol_char; nil; end

  CharString.register_eol(self)
end
574
+
575
+ end # module CharString
576
+
577
+ # class String
578
+ # include CharString
579
+ # end