docdiff 0.5.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +5 -3
- data/Gemfile +1 -1
- data/Makefile +15 -19
- data/Rakefile +45 -10
- data/bin/docdiff +25 -13
- data/devutil/Rakefile +9 -0
- data/devutil/changelog.sh +40 -0
- data/docdiff.gemspec +4 -4
- data/docdiffwebui.cgi +1 -1
- data/langfilter.rb +1 -5
- data/lib/doc_diff.rb +5 -1
- data/lib/docdiff/charstring.rb +10 -285
- data/lib/docdiff/diff/contours.rb +2 -1
- data/lib/docdiff/diff/editscript.rb +2 -0
- data/lib/docdiff/diff/rcsdiff.rb +2 -0
- data/lib/docdiff/diff/shortestpath.rb +2 -0
- data/lib/docdiff/diff/speculative.rb +6 -3
- data/lib/docdiff/diff/subsequence.rb +2 -0
- data/lib/docdiff/diff/unidiff.rb +2 -1
- data/lib/docdiff/diff.rb +2 -0
- data/lib/docdiff/difference.rb +2 -0
- data/lib/docdiff/document.rb +2 -0
- data/lib/docdiff/encoding/en_ascii.rb +15 -40
- data/lib/docdiff/encoding/ja_eucjp.rb +15 -40
- data/lib/docdiff/encoding/ja_sjis.rb +15 -40
- data/lib/docdiff/encoding/ja_utf8.rb +15 -40
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +16 -14
- data/lib/docdiff.rb +1 -1
- data/readme.html +41 -4
- data/readme.md +185 -0
- data/test/charstring_test.rb +16 -26
- data/test/diff_test.rb +2 -1
- data/test/difference_test.rb +2 -1
- data/test/docdiff_test.rb +12 -3
- data/test/document_test.rb +7 -6
- data/test/view_test.rb +3 -1
- metadata +23 -34
- data/devutil/JIS0208.TXT +0 -6952
- data/lib/viewdiff.rb +0 -375
- data/test/viewdiff_test.rb +0 -908
data/lib/docdiff/charstring.rb
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# To use, include to String, or extend String.
|
|
4
4
|
# 2003- Hisashi MORITA
|
|
5
5
|
|
|
6
|
+
class DocDiff
|
|
6
7
|
module CharString
|
|
7
8
|
|
|
8
9
|
Encodings = {}
|
|
@@ -72,9 +73,10 @@ module CharString
|
|
|
72
73
|
# returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
|
|
73
74
|
# 'NONE'(1-line), or nil
|
|
74
75
|
return nil if string == nil #=> nil (argument missing)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
'
|
|
76
|
+
bin_string = string.dup.force_encoding("ASCII-8BIT")
|
|
77
|
+
eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
|
|
78
|
+
'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
|
|
79
|
+
'CRLF' => bin_string.scan(/(\r\n)/o).size}
|
|
78
80
|
eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
|
|
79
81
|
eols = eol_counts.keys
|
|
80
82
|
eol_variety = eols.size # numbers of flavors found
|
|
@@ -87,10 +89,6 @@ module CharString
|
|
|
87
89
|
end
|
|
88
90
|
end
|
|
89
91
|
|
|
90
|
-
def CharString.ruby_m17n?
|
|
91
|
-
"".respond_to?(:force_encoding)
|
|
92
|
-
end
|
|
93
|
-
|
|
94
92
|
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
|
95
93
|
# thus some of the following methods are not 'linguistically correct'.
|
|
96
94
|
|
|
@@ -128,7 +126,6 @@ module CharString
|
|
|
128
126
|
}.compact.size
|
|
129
127
|
end
|
|
130
128
|
|
|
131
|
-
if ruby_m17n?
|
|
132
129
|
# for Ruby-1.9
|
|
133
130
|
def encoding()
|
|
134
131
|
String.new(self).encoding.to_s
|
|
@@ -234,10 +231,11 @@ if ruby_m17n?
|
|
|
234
231
|
end
|
|
235
232
|
|
|
236
233
|
def count_graph_line()
|
|
234
|
+
graph = (Encodings['UTF-8']::GRAPH +
|
|
235
|
+
Encodings['UTF-8']::JA_GRAPH).chars.uniq.join
|
|
236
|
+
re_graph = Regexp.new("[#{Regexp.quote(graph)}]", Regexp::MULTILINE)
|
|
237
237
|
split_to_line.collect{|line|
|
|
238
|
-
line if
|
|
239
|
-
"#{Encodings['UTF-8']::JA_GRAPH}]",
|
|
240
|
-
Regexp::MULTILINE).match line.encode('UTF-8')
|
|
238
|
+
line if re_graph.match line.encode('UTF-8')
|
|
241
239
|
}.compact.size
|
|
242
240
|
end
|
|
243
241
|
|
|
@@ -254,280 +252,6 @@ if ruby_m17n?
|
|
|
254
252
|
require 'docdiff/encoding/ja_eucjp'
|
|
255
253
|
require 'docdiff/encoding/ja_sjis'
|
|
256
254
|
require 'docdiff/encoding/ja_utf8'
|
|
257
|
-
else
|
|
258
|
-
# for Ruby-1.8
|
|
259
|
-
require 'iconv'
|
|
260
|
-
|
|
261
|
-
def encoding()
|
|
262
|
-
@encoding
|
|
263
|
-
# if @encoding
|
|
264
|
-
# @encoding
|
|
265
|
-
# else
|
|
266
|
-
# @encoding = CharString.guess_encoding(self)
|
|
267
|
-
# # raise "encoding is not set.\n"
|
|
268
|
-
# end
|
|
269
|
-
end
|
|
270
|
-
|
|
271
|
-
def encoding=(cs)
|
|
272
|
-
@encoding = cs
|
|
273
|
-
extend Encodings[@encoding] # ; p "Hey, I extended #{Encodings[@encoding]}!"
|
|
274
|
-
end
|
|
275
|
-
|
|
276
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
|
277
|
-
def CharString.guess_encoding(string)
|
|
278
|
-
return nil if string == nil
|
|
279
|
-
result_using_pureruby = CharString.guess_encoding_using_pureruby(string)
|
|
280
|
-
result_using_iconv = CharString.guess_encoding_using_iconv(string)
|
|
281
|
-
if result_using_pureruby == result_using_iconv
|
|
282
|
-
result_using_pureruby
|
|
283
|
-
else
|
|
284
|
-
"UNKNOWN"
|
|
285
|
-
end
|
|
286
|
-
end
|
|
287
|
-
|
|
288
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
|
289
|
-
def CharString.guess_encoding_using_pureruby(string)
|
|
290
|
-
return nil if string == nil
|
|
291
|
-
|
|
292
|
-
ascii_pat = '[\x00-\x7f]'
|
|
293
|
-
jis_pat = ['(?:(?:\x1b\x28\x42)',
|
|
294
|
-
'|(?:\x1b\x28\x4a)',
|
|
295
|
-
'|(?:\x1b\x28\x49)',
|
|
296
|
-
'|(?:\x1b\x24\x40)',
|
|
297
|
-
'|(?:\x1b\x24\x42)',
|
|
298
|
-
'|(?:\x1b\x24\x44))'].join
|
|
299
|
-
eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
|
300
|
-
'|(?:[\x20-\x7e])',
|
|
301
|
-
'|(?:\x8e[\xa1-\xdf])',
|
|
302
|
-
'|(?:[\xa1-\xfe][\xa1-\xfe])',
|
|
303
|
-
'|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
|
|
304
|
-
sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
|
305
|
-
'|(?:[\x20-\x7e])',
|
|
306
|
-
'|(?:[\xa1-\xdf])',
|
|
307
|
-
'|(?:[\x81-\x9f][\x40-\x7e])',
|
|
308
|
-
'|(?:[\xe0-\xef][\x80-\xfc]))'].join
|
|
309
|
-
utf8_pat = ['(?:(?:[\x00-\x7f])',
|
|
310
|
-
'|(?:[\xc0-\xdf][\x80-\xbf])',
|
|
311
|
-
'|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
|
|
312
|
-
'|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join
|
|
313
|
-
|
|
314
|
-
ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
|
|
315
|
-
jis_escseq_count = string.scan(/#{jis_pat}/on).size
|
|
316
|
-
eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
|
|
317
|
-
sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
|
|
318
|
-
utf8_match_length = string.scan(/#{utf8_pat}/no).join.length
|
|
319
|
-
|
|
320
|
-
case
|
|
321
|
-
when 0 < jis_escseq_count # JIS escape sequense found
|
|
322
|
-
guessed_encoding = 'JIS'
|
|
323
|
-
when ascii_match_length == string.length # every char is ASCII (but not JIS)
|
|
324
|
-
guessed_encoding = 'US-ASCII'
|
|
325
|
-
else
|
|
326
|
-
case
|
|
327
|
-
when eucjp_match_length < (string.length / 2) &&
|
|
328
|
-
sjis_match_length < (string.length / 2) &&
|
|
329
|
-
utf8_match_length < (string.length / 2)
|
|
330
|
-
guessed_encoding = 'UNKNOWN' # either encoding did not match long enough
|
|
331
|
-
when (eucjp_match_length < utf8_match_length) &&
|
|
332
|
-
(sjis_match_length < utf8_match_length)
|
|
333
|
-
guessed_encoding = 'UTF-8'
|
|
334
|
-
when (eucjp_match_length < sjis_match_length) &&
|
|
335
|
-
(utf8_match_length < sjis_match_length)
|
|
336
|
-
guessed_encoding = 'Shift_JIS'
|
|
337
|
-
when (sjis_match_length < eucjp_match_length) &&
|
|
338
|
-
(utf8_match_length < eucjp_match_length)
|
|
339
|
-
guessed_encoding = 'EUC-JP'
|
|
340
|
-
else
|
|
341
|
-
guessed_encoding = 'UNKNOWN' # cannot guess at all
|
|
342
|
-
end
|
|
343
|
-
end
|
|
344
|
-
return guessed_encoding
|
|
345
|
-
end
|
|
346
|
-
|
|
347
|
-
def CharString.guess_encoding_using_iconv(string)
|
|
348
|
-
valid_as_utf8 = CharString.valid_as("utf-8", string)
|
|
349
|
-
valid_as_sjis = CharString.valid_as("cp932", string) # not sjis, but cp932
|
|
350
|
-
valid_as_jis = CharString.valid_as("iso-2022-jp", string)
|
|
351
|
-
valid_as_eucjp = CharString.valid_as("eucjp", string)
|
|
352
|
-
valid_as_ascii = CharString.valid_as("ascii", string)
|
|
353
|
-
invalid_as_utf8 = CharString.invalid_as("utf-8", string)
|
|
354
|
-
invalid_as_sjis = CharString.invalid_as("cp932", string) # not sjis, but cp932
|
|
355
|
-
invalid_as_jis = CharString.invalid_as("iso-2022-jp", string)
|
|
356
|
-
invalid_as_eucjp = CharString.invalid_as("eucjp", string)
|
|
357
|
-
invalid_as_ascii = CharString.invalid_as("ascii", string)
|
|
358
|
-
case
|
|
359
|
-
when string == nil
|
|
360
|
-
nil
|
|
361
|
-
when valid_as_ascii
|
|
362
|
-
"US-ASCII"
|
|
363
|
-
when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
|
|
364
|
-
"JIS"
|
|
365
|
-
when valid_as_eucjp
|
|
366
|
-
"EUC-JP"
|
|
367
|
-
when valid_as_sjis && invalid_as_utf8 && invalid_as_eucjp && invalid_as_jis
|
|
368
|
-
"Shift_JIS"
|
|
369
|
-
when valid_as_utf8 && invalid_as_sjis && invalid_as_eucjp && invalid_as_jis
|
|
370
|
-
"UTF-8"
|
|
371
|
-
else
|
|
372
|
-
"UNKNOWN"
|
|
373
|
-
end
|
|
374
|
-
end
|
|
375
|
-
|
|
376
|
-
def CharString.valid_as(encoding_name, string)
|
|
377
|
-
begin
|
|
378
|
-
Iconv.iconv(encoding_name, encoding_name, string)
|
|
379
|
-
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
|
|
380
|
-
return false
|
|
381
|
-
else
|
|
382
|
-
return true
|
|
383
|
-
end
|
|
384
|
-
end
|
|
385
|
-
|
|
386
|
-
def CharString.invalid_as(encoding_name, string)
|
|
387
|
-
if CharString.valid_as(encoding_name, string)
|
|
388
|
-
false
|
|
389
|
-
else
|
|
390
|
-
true
|
|
391
|
-
end
|
|
392
|
-
end
|
|
393
|
-
|
|
394
|
-
def split_to_byte()
|
|
395
|
-
scan(/./nm)
|
|
396
|
-
end
|
|
397
|
-
|
|
398
|
-
def split_to_char()
|
|
399
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
|
400
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
|
401
|
-
if eol_char # sometimes string has no end-of-line char
|
|
402
|
-
scan(Regexp.new("(?:#{eol_char})|(?:.)",
|
|
403
|
-
Regexp::MULTILINE,
|
|
404
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
405
|
-
)
|
|
406
|
-
else # it seems that no EOL module was extended...
|
|
407
|
-
scan(Regexp.new("(?:.)",
|
|
408
|
-
Regexp::MULTILINE,
|
|
409
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
410
|
-
)
|
|
411
|
-
end
|
|
412
|
-
end
|
|
413
|
-
|
|
414
|
-
def count_latin_graph_char()
|
|
415
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
|
416
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
|
417
|
-
scan(Regexp.new("[#{Encodings[encoding]::GRAPH}]",
|
|
418
|
-
Regexp::MULTILINE,
|
|
419
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
420
|
-
).size
|
|
421
|
-
end
|
|
422
|
-
|
|
423
|
-
def count_ja_graph_char()
|
|
424
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
|
425
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
|
426
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
|
427
|
-
Regexp::MULTILINE,
|
|
428
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
429
|
-
).size
|
|
430
|
-
end
|
|
431
|
-
|
|
432
|
-
def count_latin_blank_char()
|
|
433
|
-
scan(Regexp.new("[#{Encodings[encoding]::BLANK}]",
|
|
434
|
-
Regexp::MULTILINE,
|
|
435
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
436
|
-
).size
|
|
437
|
-
end
|
|
438
|
-
|
|
439
|
-
def count_ja_blank_char()
|
|
440
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
|
|
441
|
-
Regexp::MULTILINE,
|
|
442
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
443
|
-
).size
|
|
444
|
-
end
|
|
445
|
-
|
|
446
|
-
def split_to_word()
|
|
447
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
|
448
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
|
449
|
-
scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
|
|
450
|
-
Regexp::MULTILINE,
|
|
451
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
452
|
-
)
|
|
453
|
-
end
|
|
454
|
-
|
|
455
|
-
def count_latin_word()
|
|
456
|
-
split_to_word.collect{|word|
|
|
457
|
-
word if Regexp.new("[#{Encodings[encoding]::PRINT}]",
|
|
458
|
-
Regexp::MULTILINE,
|
|
459
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
|
460
|
-
}.compact.size
|
|
461
|
-
end
|
|
462
|
-
|
|
463
|
-
def count_ja_word()
|
|
464
|
-
split_to_word.collect{|word|
|
|
465
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
|
|
466
|
-
Regexp::MULTILINE,
|
|
467
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
|
468
|
-
}.compact.size
|
|
469
|
-
end
|
|
470
|
-
|
|
471
|
-
def count_latin_valid_word()
|
|
472
|
-
split_to_word.collect{|word|
|
|
473
|
-
word if Regexp.new("[#{Encodings[encoding]::ALNUM}]",
|
|
474
|
-
Regexp::MULTILINE,
|
|
475
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
|
476
|
-
}.compact.size
|
|
477
|
-
end
|
|
478
|
-
|
|
479
|
-
def count_ja_valid_word()
|
|
480
|
-
split_to_word.collect{|word|
|
|
481
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
|
482
|
-
Regexp::MULTILINE,
|
|
483
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
|
484
|
-
}.compact.size
|
|
485
|
-
end
|
|
486
|
-
|
|
487
|
-
def split_to_line()
|
|
488
|
-
# scan(Regexp.new(".*?#{eol_char}|.+",
|
|
489
|
-
# Regexp::MULTILINE,
|
|
490
|
-
# encoding.sub(/ASCII/i, 'none'))
|
|
491
|
-
# )
|
|
492
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
|
493
|
-
raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
|
494
|
-
if defined? eol_char
|
|
495
|
-
scan(Regexp.new(".*?#{eol_char}|.+",
|
|
496
|
-
Regexp::MULTILINE,
|
|
497
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
498
|
-
)
|
|
499
|
-
else
|
|
500
|
-
scan(Regexp.new(".+",
|
|
501
|
-
Regexp::MULTILINE,
|
|
502
|
-
encoding.sub(/ASCII/i, 'none'))
|
|
503
|
-
)
|
|
504
|
-
end
|
|
505
|
-
end
|
|
506
|
-
|
|
507
|
-
def count_graph_line()
|
|
508
|
-
split_to_line.collect{|line|
|
|
509
|
-
line if Regexp.new("[#{Encodings[encoding]::GRAPH}" +
|
|
510
|
-
"#{Encodings[encoding]::JA_GRAPH}]",
|
|
511
|
-
Regexp::MULTILINE,
|
|
512
|
-
encoding.sub(/ASCII/, 'none')).match line
|
|
513
|
-
}.compact.size
|
|
514
|
-
end
|
|
515
|
-
|
|
516
|
-
def count_blank_line()
|
|
517
|
-
split_to_line.collect{|line|
|
|
518
|
-
line if Regexp.new("^[#{Encodings[encoding]::BLANK}" +
|
|
519
|
-
"#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
|
|
520
|
-
Regexp::MULTILINE,
|
|
521
|
-
encoding.sub(/ASCII/, 'none')).match line
|
|
522
|
-
}.compact.size
|
|
523
|
-
end
|
|
524
|
-
|
|
525
|
-
# load encoding modules
|
|
526
|
-
require 'docdiff/encoding/en_ascii'
|
|
527
|
-
require 'docdiff/encoding/ja_eucjp'
|
|
528
|
-
require 'docdiff/encoding/ja_sjis'
|
|
529
|
-
require 'docdiff/encoding/ja_utf8'
|
|
530
|
-
end # end ruby_m17n?
|
|
531
255
|
alias to_bytes split_to_byte
|
|
532
256
|
alias to_chars split_to_char
|
|
533
257
|
alias to_words split_to_word
|
|
@@ -573,6 +297,7 @@ end # end ruby_m17n?
|
|
|
573
297
|
end
|
|
574
298
|
|
|
575
299
|
end # module CharString
|
|
300
|
+
end # class DocDiff
|
|
576
301
|
|
|
577
302
|
# class String
|
|
578
303
|
# include CharString
|
|
@@ -46,6 +46,7 @@ Also in Nordic Journal of Computing (NJC), Vol. 2, No. 4, Winter 1995, 444 - 461
|
|
|
46
46
|
http://web.informatik.uni-bonn.de/IV/Mitarbeiter/rick/lcs.dvi.Z
|
|
47
47
|
=end
|
|
48
48
|
|
|
49
|
+
class DocDiff
|
|
49
50
|
class Diff
|
|
50
51
|
class Contours
|
|
51
52
|
def initialize(a, b)
|
|
@@ -379,4 +380,4 @@ class Diff
|
|
|
379
380
|
end
|
|
380
381
|
end
|
|
381
382
|
end
|
|
382
|
-
|
|
383
|
+
end # class DocDiff
|
data/lib/docdiff/diff/rcsdiff.rb
CHANGED
|
@@ -2,6 +2,7 @@ require 'docdiff/diff/shortestpath'
|
|
|
2
2
|
require 'docdiff/diff/contours'
|
|
3
3
|
require 'thread'
|
|
4
4
|
|
|
5
|
+
class DocDiff
|
|
5
6
|
class Diff
|
|
6
7
|
class Speculative
|
|
7
8
|
def initialize(a, b)
|
|
@@ -14,21 +15,22 @@ class Diff
|
|
|
14
15
|
result = nil
|
|
15
16
|
|
|
16
17
|
tg = ThreadGroup.new
|
|
18
|
+
m = Mutex.new
|
|
17
19
|
|
|
18
20
|
# Since ShortestPath is faster than Contours if two sequences are very similar,
|
|
19
21
|
# try it first.
|
|
20
22
|
tg.add(Thread.new {
|
|
21
23
|
#print "ShortestPath start.\n"
|
|
22
24
|
result = ShortestPath.new(@a, @b).lcs
|
|
23
|
-
|
|
25
|
+
m.synchronize {tg.list.each {|t| t.kill if t != Thread.current}}
|
|
24
26
|
#print "ShortestPath win.\n"
|
|
25
27
|
})
|
|
26
28
|
|
|
27
|
-
# start Contours unless ShortestPath is already ended with first quantum,
|
|
29
|
+
# start Contours unless ShortestPath is already ended with first quantum,
|
|
28
30
|
tg.add(Thread.new {
|
|
29
31
|
#print "Contours start.\n"
|
|
30
32
|
result = Contours.new(@a, @b).lcs
|
|
31
|
-
|
|
33
|
+
m.synchronize {tg.list.each {|t| t.kill if t != Thread.current}}
|
|
32
34
|
#print "Contours win.\n"
|
|
33
35
|
}) unless tg.list.empty?
|
|
34
36
|
|
|
@@ -38,3 +40,4 @@ class Diff
|
|
|
38
40
|
end
|
|
39
41
|
end
|
|
40
42
|
end
|
|
43
|
+
end # class DocDiff
|
data/lib/docdiff/diff/unidiff.rb
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
class DocDiff
|
|
1
2
|
class Diff
|
|
2
3
|
def Diff.unidiff(a, b, algorithm=nil)
|
|
3
4
|
al = []
|
|
@@ -19,7 +20,6 @@ class Diff
|
|
|
19
20
|
end
|
|
20
21
|
|
|
21
22
|
def unidiff(out='', context_lines=3)
|
|
22
|
-
state = :common
|
|
23
23
|
l1 = l2 = 1
|
|
24
24
|
hunk = []
|
|
25
25
|
hunk_l1 = hunk_l2 = 1
|
|
@@ -122,3 +122,4 @@ class Diff
|
|
|
122
122
|
end
|
|
123
123
|
end
|
|
124
124
|
end
|
|
125
|
+
end # class DocDiff
|
data/lib/docdiff/diff.rb
CHANGED
|
@@ -50,6 +50,7 @@ So, reduced input has following properties:
|
|
|
50
50
|
* Any elemnt in B is also exist in A.
|
|
51
51
|
|
|
52
52
|
=end
|
|
53
|
+
class DocDiff
|
|
53
54
|
class Diff
|
|
54
55
|
def initialize(a, b)
|
|
55
56
|
@original_a = a
|
|
@@ -215,3 +216,4 @@ class Diff
|
|
|
215
216
|
end
|
|
216
217
|
end
|
|
217
218
|
end
|
|
219
|
+
end # class DocDiff
|
data/lib/docdiff/difference.rb
CHANGED
data/lib/docdiff/document.rb
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
# English ASCII encoding module for CharString
|
|
2
2
|
# 2003- Hisashi MORITA
|
|
3
3
|
|
|
4
|
+
# frozen_string_literal: false
|
|
5
|
+
|
|
6
|
+
class DocDiff
|
|
4
7
|
module CharString
|
|
5
8
|
module ASCII
|
|
6
9
|
|
|
@@ -13,50 +16,21 @@ module CharString
|
|
|
13
16
|
SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
|
|
14
17
|
BLANK = "\x09\x20"
|
|
15
18
|
DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
16
|
-
|
|
17
|
-
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
18
|
-
"\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
|
|
19
|
-
"\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
|
|
20
|
-
"\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
|
|
21
|
-
"\x79\x7a"
|
|
22
|
-
ALNUM = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
|
|
23
|
-
"\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
19
|
+
UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
24
20
|
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
25
|
-
"\x55\x56\x57\x58\x59\x5a
|
|
26
|
-
|
|
27
|
-
"\x6f\x70\x71\x72\x73\x74
|
|
28
|
-
"\x79\x7a"
|
|
21
|
+
"\x55\x56\x57\x58\x59\x5a"
|
|
22
|
+
LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
23
|
+
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
24
|
+
"\x75\x76\x77\x78\x79\x7a"
|
|
25
|
+
ALPHA = UPPER + LOWER
|
|
26
|
+
ALNUM = DIGIT + ALPHA
|
|
29
27
|
PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
30
28
|
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
31
29
|
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
32
30
|
"\x7d\x7e"
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
37
|
-
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
38
|
-
"\x55\x56\x57\x58\x59\x5a"
|
|
39
|
-
PRINT = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29" \
|
|
40
|
-
"\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33" \
|
|
41
|
-
"\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d" \
|
|
42
|
-
"\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47" \
|
|
43
|
-
"\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51" \
|
|
44
|
-
"\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b" \
|
|
45
|
-
"\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65" \
|
|
46
|
-
"\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
|
|
47
|
-
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79" \
|
|
48
|
-
"\x7a\x7b\x7c\x7d\x7e"
|
|
49
|
-
GRAPH = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
50
|
-
"\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34" \
|
|
51
|
-
"\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e" \
|
|
52
|
-
"\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48" \
|
|
53
|
-
"\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52" \
|
|
54
|
-
"\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c" \
|
|
55
|
-
"\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66" \
|
|
56
|
-
"\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70" \
|
|
57
|
-
"\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a" \
|
|
58
|
-
"\x7b\x7c\x7d\x7e"
|
|
59
|
-
XDIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
|
|
31
|
+
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
32
|
+
PRINT = "\x20" + GRAPH
|
|
33
|
+
XDIGIT = DIGIT +
|
|
60
34
|
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
61
35
|
"\x65\x66"
|
|
62
36
|
|
|
@@ -94,4 +68,5 @@ module CharString
|
|
|
94
68
|
CharString.register_encoding(self)
|
|
95
69
|
|
|
96
70
|
end # module ASCII
|
|
97
|
-
end
|
|
71
|
+
end # module CharString
|
|
72
|
+
end # class DocDiff
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
# Japanese EUC-JP encoding module for CharString
|
|
2
2
|
# 2003- Hisashi MORITA
|
|
3
3
|
|
|
4
|
+
# frozen_string_literal: false
|
|
5
|
+
|
|
6
|
+
class DocDiff
|
|
4
7
|
module CharString
|
|
5
8
|
module EUC_JP
|
|
6
9
|
|
|
@@ -16,50 +19,21 @@ module CharString
|
|
|
16
19
|
SPACE = "\x09\x0a\x0b\x0c\x0d\x20"
|
|
17
20
|
BLANK = "\x09\x20"
|
|
18
21
|
DIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39"
|
|
19
|
-
|
|
20
|
-
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
21
|
-
"\x55\x56\x57\x58\x59\x5a\x61\x62\x63\x64" \
|
|
22
|
-
"\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e" \
|
|
23
|
-
"\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78" \
|
|
24
|
-
"\x79\x7a"
|
|
25
|
-
ALNUM = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
|
|
26
|
-
"\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
22
|
+
UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
27
23
|
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
28
|
-
"\x55\x56\x57\x58\x59\x5a
|
|
29
|
-
|
|
30
|
-
"\x6f\x70\x71\x72\x73\x74
|
|
31
|
-
"\x79\x7a"
|
|
24
|
+
"\x55\x56\x57\x58\x59\x5a"
|
|
25
|
+
LOWER = "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a" \
|
|
26
|
+
"\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74" \
|
|
27
|
+
"\x75\x76\x77\x78\x79\x7a"
|
|
28
|
+
ALPHA = UPPER + LOWER
|
|
29
|
+
ALNUM = DIGIT + ALPHA
|
|
32
30
|
PUNCT = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
33
31
|
"\x2b\x2c\x2d\x2e\x2f\x3a\x3b\x3c\x3d\x3e" \
|
|
34
32
|
"\x3f\x40\x5b\x5c\x5d\x5e\x5f\x60\x7b\x7c" \
|
|
35
33
|
"\x7d\x7e"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
UPPER = "\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a" \
|
|
40
|
-
"\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54" \
|
|
41
|
-
"\x55\x56\x57\x58\x59\x5a"
|
|
42
|
-
PRINT = "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29" \
|
|
43
|
-
"\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33" \
|
|
44
|
-
"\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d" \
|
|
45
|
-
"\x3e\x3f\x40\x41\x42\x43\x44\x45\x46\x47" \
|
|
46
|
-
"\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51" \
|
|
47
|
-
"\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b" \
|
|
48
|
-
"\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65" \
|
|
49
|
-
"\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" \
|
|
50
|
-
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79" \
|
|
51
|
-
"\x7a\x7b\x7c\x7d\x7e"
|
|
52
|
-
GRAPH = "\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a" \
|
|
53
|
-
"\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34" \
|
|
54
|
-
"\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e" \
|
|
55
|
-
"\x3f\x40\x41\x42\x43\x44\x45\x46\x47\x48" \
|
|
56
|
-
"\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52" \
|
|
57
|
-
"\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c" \
|
|
58
|
-
"\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66" \
|
|
59
|
-
"\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70" \
|
|
60
|
-
"\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a" \
|
|
61
|
-
"\x7b\x7c\x7d\x7e"
|
|
62
|
-
XDIGIT = "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39" \
|
|
34
|
+
GRAPH = DIGIT + UPPER + LOWER + PUNCT
|
|
35
|
+
PRINT = "\x20" + GRAPH
|
|
36
|
+
XDIGIT = DIGIT +
|
|
63
37
|
"\x41\x42\x43\x44\x45\x46\x61\x62\x63\x64" \
|
|
64
38
|
"\x65\x66"
|
|
65
39
|
JA_SPACE = "\xa1\xa1"
|
|
@@ -266,4 +240,5 @@ module CharString
|
|
|
266
240
|
CharString.register_encoding(self)
|
|
267
241
|
|
|
268
242
|
end # module EUCJP
|
|
269
|
-
end
|
|
243
|
+
end # module CharString
|
|
244
|
+
end # class DocDiff
|