docdiff 0.5.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +5 -3
- data/Gemfile +1 -1
- data/Makefile +15 -19
- data/Rakefile +45 -10
- data/bin/docdiff +9 -11
- data/devutil/Rakefile +9 -0
- data/devutil/changelog.sh +40 -0
- data/docdiff.gemspec +4 -4
- data/docdiffwebui.cgi +1 -1
- data/langfilter.rb +1 -5
- data/lib/doc_diff.rb +5 -1
- data/lib/docdiff.rb +1 -1
- data/lib/docdiff/charstring.rb +6 -282
- data/lib/docdiff/diff.rb +2 -0
- data/lib/docdiff/diff/contours.rb +2 -1
- data/lib/docdiff/diff/editscript.rb +2 -0
- data/lib/docdiff/diff/rcsdiff.rb +2 -0
- data/lib/docdiff/diff/shortestpath.rb +2 -0
- data/lib/docdiff/diff/speculative.rb +6 -3
- data/lib/docdiff/diff/subsequence.rb +2 -0
- data/lib/docdiff/diff/unidiff.rb +2 -0
- data/lib/docdiff/difference.rb +2 -0
- data/lib/docdiff/document.rb +2 -0
- data/lib/docdiff/encoding/en_ascii.rb +3 -1
- data/lib/docdiff/encoding/ja_eucjp.rb +3 -1
- data/lib/docdiff/encoding/ja_sjis.rb +3 -1
- data/lib/docdiff/encoding/ja_utf8.rb +3 -1
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +4 -10
- data/lib/viewdiff.rb +9 -5
- data/readme.html +23 -3
- data/readme.md +184 -0
- data/test/charstring_test.rb +13 -26
- data/test/diff_test.rb +2 -1
- data/test/difference_test.rb +2 -1
- data/test/docdiff_test.rb +9 -2
- data/test/document_test.rb +4 -6
- data/test/view_test.rb +3 -1
- data/test/viewdiff_test.rb +14 -11
- metadata +23 -29
- data/devutil/JIS0208.TXT +0 -6952
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f8733587f13d662ca5c5fda60843298ef71ef798284d9f677d03378d8b5e0e29
|
4
|
+
data.tar.gz: f92804fdb17576aaded799010553a0d6685dbb8679e53cbf6cf15c4773af8ddf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 94af3213ec734c2b80ad72f70bb1d1312d14da7d15e77237f1824ade5d83f39d666bf8049dcb4c6a2898b402ab80b64c22184bb93359f23a8496f83d0e191f0b
|
7
|
+
data.tar.gz: dfd7a6f65ff88a556b5ccb5612daac005b3405cb0f36f64cc5f4914a061bdbd20c4ac4ab5610c657b9036f09485a18e912e527bc5c78b276d2a494bfc977455a
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/Makefile
CHANGED
@@ -1,19 +1,21 @@
|
|
1
|
+
# Warning: this Makefile is obsolete, use Rakefile instead
|
2
|
+
|
1
3
|
PRODUCT = docdiff
|
2
|
-
VERSION =
|
4
|
+
VERSION = $(shell $(RUBY) -r./lib/docdiff/version.rb -e 'Docdiff::VERSION.display')
|
3
5
|
RUBY = ruby
|
4
6
|
TAR_XVCS = tar --exclude=.svn --exclude=.git
|
5
7
|
|
6
8
|
DOCS = ChangeLog readme.en.html readme.ja.html \
|
7
9
|
index.en.html index.ja.html
|
8
10
|
DOCSRC = readme.html index.html img sample
|
9
|
-
TESTS =
|
10
|
-
|
11
|
-
DIST = Makefile devutil docdiff docdiff.conf.example docdiff.rb \
|
11
|
+
TESTS = test/*_test.rb
|
12
|
+
DIST = Makefile devutil lib docdiff.conf.example bin/docdiff \
|
12
13
|
docdiff.gemspec \
|
13
14
|
docdiffwebui.html docdiffwebui.cgi \
|
14
15
|
$(DOCSRC) $(DOCS) $(TESTS)
|
15
|
-
TESTLOGS =
|
16
|
-
|
16
|
+
TESTLOGS = $(foreach t,\
|
17
|
+
$(wildcard test/*_test.rb),\
|
18
|
+
$(t:test/%_test.rb=%_test.log)) \
|
17
19
|
|
18
20
|
WWWUSER = hisashim,docdiff
|
19
21
|
WWWSITE = web.sourceforge.net
|
@@ -28,22 +30,16 @@ all: $(DOCS)
|
|
28
30
|
|
29
31
|
testall:
|
30
32
|
$(MAKE) test RUBY=ruby1.9.1
|
31
|
-
$(MAKE) test RUBY=ruby1.8
|
32
33
|
|
33
34
|
test: $(TESTLOGS)
|
34
35
|
|
35
|
-
|
36
|
-
$(RUBY) -I
|
36
|
+
%_test.log:
|
37
|
+
$(RUBY) -I./lib test/$*_test.rb | tee $@
|
37
38
|
|
38
39
|
docs: $(DOCS)
|
39
40
|
|
40
41
|
ChangeLog:
|
41
|
-
|
42
|
-
if [ -d .svn ] ; then \
|
43
|
-
svn log -rHEAD:0 -v > ChangeLog ; \
|
44
|
-
else \
|
45
|
-
git svn log > ChangeLog ; \
|
46
|
-
fi
|
42
|
+
devutil/changelog.sh > $@
|
47
43
|
|
48
44
|
readme.%.html: readme.html
|
49
45
|
$(RUBY) -Ku langfilter.rb --$* $< > $@
|
@@ -54,13 +50,13 @@ install: $(DIST)
|
|
54
50
|
@if [ ! -d $(DESTDIR)$(PREFIX)/bin ]; then \
|
55
51
|
mkdir -p $(DESTDIR)$(PREFIX)/bin; \
|
56
52
|
fi
|
57
|
-
cp -Ppv docdiff
|
53
|
+
cp -Ppv bin/docdiff $(DESTDIR)$(PREFIX)/bin/
|
58
54
|
chmod +x $(DESTDIR)$(PREFIX)/bin/docdiff
|
59
55
|
|
60
|
-
@if [ ! -d $(datadir)
|
61
|
-
mkdir -p $(datadir)
|
56
|
+
@if [ ! -d $(datadir)/$(PRODUCT) ]; then \
|
57
|
+
mkdir -p $(datadir)/$(PRODUCT); \
|
62
58
|
fi
|
63
|
-
($(TAR_XVCS) -cf -
|
59
|
+
(cd lib && $(TAR_XVCS) -cf - *) | (cd $(datadir)/$(PRODUCT) && tar -xpf -)
|
64
60
|
|
65
61
|
@if [ ! -d $(DESTDIR)/etc/$(PRODUCT) ]; then \
|
66
62
|
mkdir -p $(DESTDIR)/etc/$(PRODUCT); \
|
data/Rakefile
CHANGED
@@ -1,17 +1,52 @@
|
|
1
|
+
require 'rake/clean'
|
2
|
+
require 'rake/testtask'
|
1
3
|
require 'bundler/gem_tasks'
|
2
4
|
|
3
|
-
|
5
|
+
RUBY = ENV['RUBY'] ||= 'ruby'
|
6
|
+
DOCS = FileList['ChangeLog', 'readme.en.html', 'readme.ja.html',
|
7
|
+
'index.en.html', 'index.ja.html']
|
8
|
+
DOCSRC = FileList['readme.html', 'index.html', 'img', 'sample']
|
9
|
+
TESTS = FileList['test/*_test.rb']
|
10
|
+
TESTLOGS = Dir.glob('test/*_test.rb').map{|f|
|
11
|
+
File.basename(f).ext('log')
|
12
|
+
}
|
13
|
+
|
14
|
+
WWWUSER = ENV['WWWUSER'] ||= 'hisashim,docdiff'
|
15
|
+
WWWSITE = ENV['WWWSITE'] ||= 'web.sourceforge.net'
|
16
|
+
WWWSITEPATH = ENV['WWWSITEPATH'] ||= 'htdocs/'
|
17
|
+
WWWDRYRUN = ENV['WWWDRYRUN'] ||= '--dry-run'
|
18
|
+
|
4
19
|
Rake::TestTask.new do |t|
|
5
|
-
t.test_files =
|
20
|
+
t.test_files = TESTS
|
6
21
|
t.verbose = true
|
7
22
|
end
|
8
23
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
24
|
+
task :default => :test
|
25
|
+
|
26
|
+
desc "generate documents"
|
27
|
+
task :docs => DOCS
|
28
|
+
|
29
|
+
file 'ChangeLog' do |t|
|
30
|
+
sh "devutil/changelog.sh > #{t.name}"
|
17
31
|
end
|
32
|
+
|
33
|
+
rule(/.*\.(?:en|ja)\.html/ => proc{|tn| tn.gsub(/\.(?:en|ja)/, '')}) do |t|
|
34
|
+
sh "#{RUBY} -E UTF-8 langfilter.rb" +
|
35
|
+
" --#{t.name.gsub(/.*?\.(en|ja)\.html/){$1}}" +
|
36
|
+
" #{t.prerequisites.first} > #{t.name}"
|
37
|
+
end
|
38
|
+
|
39
|
+
desc "force to rsync web contents"
|
40
|
+
task :wwwupload do |t|
|
41
|
+
sh "rake www WWWDRYRUN="
|
42
|
+
end
|
43
|
+
|
44
|
+
desc "rsync web contents"
|
45
|
+
task :www => DOCSRC + DOCS do |t|
|
46
|
+
sh "rsync #{WWWDRYRUN} -auv -e ssh --delete" +
|
47
|
+
" --exclude='.svn' --exclude='.git'" +
|
48
|
+
t.prerequisites.join(' ') +
|
49
|
+
" #{WWWUSER}@#{WWWSITE}:#{WWWSITEPATH}"
|
50
|
+
end
|
51
|
+
|
52
|
+
CLEAN.include(DOCS, TESTLOGS)
|
data/bin/docdiff
CHANGED
@@ -1,11 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# DocDiff: word/character-oriented text comparison utility
|
3
3
|
# Copyright (C) 2002-2011 Hisashi MORITA
|
4
|
-
# Requirements: Ruby (>=
|
4
|
+
# Requirements: Ruby (>= 2.0)
|
5
5
|
require 'docdiff'
|
6
|
-
require 'docdiff/difference'
|
7
|
-
require 'docdiff/document'
|
8
|
-
require 'docdiff/view'
|
9
6
|
require 'optparse'
|
10
7
|
|
11
8
|
# do_config_stuff
|
@@ -40,7 +37,7 @@ ARGV.options {|o|
|
|
40
37
|
o.def_option('--char', 'set resolution to char'){clo[:resolution] = "char"}
|
41
38
|
|
42
39
|
o.def_option('--encoding=ENCODING',
|
43
|
-
possible_encodings = ['ASCII','EUC-JP','Shift_JIS','UTF-8','auto'],
|
40
|
+
possible_encodings = ['ASCII','EUC-JP','Shift_JIS','CP932','UTF-8','auto'],
|
44
41
|
'specify character encoding',
|
45
42
|
possible_encodings.join('|'), "(default is auto. try ASCII for single byte encodings such as ISO-8859-X)"
|
46
43
|
){|s| clo[:encoding] = (s || "auto")}
|
@@ -48,6 +45,7 @@ ARGV.options {|o|
|
|
48
45
|
o.def_option('--iso8859x', 'same as --encoding=ASCII'){clo[:encoding] = "ASCII"}
|
49
46
|
o.def_option('--eucjp', 'same as --encoding=EUC-JP'){clo[:encoding] = "EUC-JP"}
|
50
47
|
o.def_option('--sjis', 'same as --encoding=Shift_JIS'){clo[:encoding] = "Shift_JIS"}
|
48
|
+
o.def_option('--cp932', 'same as --encoding=CP932'){clo[:encoding] = "CP932"}
|
51
49
|
o.def_option('--utf8', 'same as --encoding=UTF-8'){clo[:encoding] = "UTF-8"}
|
52
50
|
|
53
51
|
o.def_option('--eol=EOL',
|
@@ -145,8 +143,8 @@ eol1 = docdiff.config[:eol]
|
|
145
143
|
eol2 = docdiff.config[:eol]
|
146
144
|
|
147
145
|
if docdiff.config[:encoding] == "auto"
|
148
|
-
encoding1 = CharString.guess_encoding(file1_content)
|
149
|
-
encoding2 = CharString.guess_encoding(file2_content)
|
146
|
+
encoding1 = DocDiff::CharString.guess_encoding(file1_content)
|
147
|
+
encoding2 = DocDiff::CharString.guess_encoding(file2_content)
|
150
148
|
case
|
151
149
|
when (encoding1 == "UNKNOWN" or encoding2 == "UNKNOWN")
|
152
150
|
raise "Document encoding unknown (#{encoding1}, #{encoding2})."
|
@@ -156,8 +154,8 @@ if docdiff.config[:encoding] == "auto"
|
|
156
154
|
end
|
157
155
|
|
158
156
|
if docdiff.config[:eol] == "auto"
|
159
|
-
eol1 = CharString.guess_eol(file1_content)
|
160
|
-
eol2 = CharString.guess_eol(file2_content)
|
157
|
+
eol1 = DocDiff::CharString.guess_eol(file1_content)
|
158
|
+
eol2 = DocDiff::CharString.guess_eol(file2_content)
|
161
159
|
case
|
162
160
|
when (eol1.nil? or eol2.nil?)
|
163
161
|
raise "Document eol is nil (#{eol1.inspect}, #{eol2.inspect}). The document might be empty."
|
@@ -168,8 +166,8 @@ if docdiff.config[:eol] == "auto"
|
|
168
166
|
end
|
169
167
|
end
|
170
168
|
|
171
|
-
doc1 = Document.new(file1_content, encoding1, eol1)
|
172
|
-
doc2 = Document.new(file2_content, encoding2, eol2)
|
169
|
+
doc1 = DocDiff::Document.new(file1_content, encoding1, eol1)
|
170
|
+
doc2 = DocDiff::Document.new(file2_content, encoding2, eol2)
|
173
171
|
|
174
172
|
output = docdiff.run(doc1, doc2,
|
175
173
|
{:resolution => docdiff.config[:resolution],
|
data/devutil/Rakefile
ADDED
data/devutil/changelog.sh
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
# ChangeLog Generator
|
3
|
+
# Copyright 2011 Hisashi Morita
|
4
|
+
# License: Public Domain
|
5
|
+
#
|
6
|
+
# Usage:
|
7
|
+
# changelog.sh [WORKING_DIR] > ChangeLog
|
8
|
+
|
9
|
+
if [ "$1" ]; then
|
10
|
+
WD="$1"
|
11
|
+
else
|
12
|
+
WD="."
|
13
|
+
fi
|
14
|
+
|
15
|
+
# Subversion
|
16
|
+
which svn >/dev/null
|
17
|
+
if [ x"$?" = x0 ]; then
|
18
|
+
(svn info "${WD}" >/dev/null 2>&1) && SVN=TRUE
|
19
|
+
if [ x"${SVN}" = xTRUE ]; then
|
20
|
+
(cd "${WD}"; svn log -rBASE:0 -v)
|
21
|
+
fi
|
22
|
+
fi
|
23
|
+
|
24
|
+
# Git
|
25
|
+
which git >/dev/null
|
26
|
+
if [ x"$?" = x0 ]; then
|
27
|
+
(cd "${WD}" && git status --porcelain >/dev/null 2>&1) && GIT=TRUE
|
28
|
+
if [ x"${GIT}" = xTRUE ]; then
|
29
|
+
(cd "${WD}"; git log | cat)
|
30
|
+
fi
|
31
|
+
fi
|
32
|
+
|
33
|
+
# Mercurial
|
34
|
+
which hg >/dev/null
|
35
|
+
if [ x"$?" = x0 ]; then
|
36
|
+
(hg status "${WD}" >/dev/null 2>&1) && HG=TRUE
|
37
|
+
if [ x"${HG}" = xTRUE ]; then
|
38
|
+
(cd "${WD}"; hg log --rev tip:0)
|
39
|
+
fi
|
40
|
+
fi
|
data/docdiff.gemspec
CHANGED
@@ -1,20 +1,20 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
$:.
|
2
|
+
$:.unshift File.expand_path("../lib", __FILE__)
|
3
3
|
require "docdiff/version"
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "docdiff"
|
7
7
|
s.version = Docdiff::VERSION
|
8
|
+
s.license = "BSD-3-Clause"
|
8
9
|
s.authors = ["Hisashi Morita"]
|
9
|
-
s.email = ["hisashim at
|
10
|
-
s.homepage = "
|
10
|
+
s.email = ["hisashim at workbook.org"]
|
11
|
+
s.homepage = "https://github.com/hisashim/docdiff"
|
11
12
|
s.summary = %q{Word-by-word diff}
|
12
13
|
s.description = %q{DocDiff compares two text files and shows the
|
13
14
|
difference. It can compare files word by word,
|
14
15
|
character by character, or line by line. It has
|
15
16
|
several output formats such as HTML, tty, Manued,
|
16
17
|
or user-defined markup.}
|
17
|
-
s.rubyforge_project = "docdiff"
|
18
18
|
|
19
19
|
s.files = `git ls-files`.split("\n")
|
20
20
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/docdiffwebui.cgi
CHANGED
data/langfilter.rb
CHANGED
@@ -2,13 +2,9 @@
|
|
2
2
|
# language filter
|
3
3
|
# usage: langfilter.rb --en <infile >outfile
|
4
4
|
|
5
|
-
def ruby_m17n?
|
6
|
-
return true if "".respond_to? :encoding
|
7
|
-
end
|
8
|
-
|
9
5
|
lang_to_include = ARGV.shift.gsub(/-+/, "")
|
10
6
|
lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
|
11
7
|
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m
|
12
8
|
|
13
|
-
ARGF.set_encoding("UTF-8")
|
9
|
+
ARGF.set_encoding("UTF-8")
|
14
10
|
ARGF.read.gsub(re, "").display
|
data/lib/doc_diff.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
# DocDiff: word/character-oriented text comparison utility
|
2
2
|
# Copyright (C) 2002-2011 Hisashi MORITA
|
3
|
-
# Requirements: Ruby (>=
|
3
|
+
# Requirements: Ruby (>= 2.0)
|
4
|
+
require 'docdiff/difference'
|
5
|
+
require 'docdiff/document'
|
6
|
+
require 'docdiff/view'
|
7
|
+
|
4
8
|
class DocDiff
|
5
9
|
|
6
10
|
AppVersion = Docdiff::VERSION
|
data/lib/docdiff.rb
CHANGED
data/lib/docdiff/charstring.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# To use, include to String, or extend String.
|
4
4
|
# 2003- Hisashi MORITA
|
5
5
|
|
6
|
+
class DocDiff
|
6
7
|
module CharString
|
7
8
|
|
8
9
|
Encodings = {}
|
@@ -72,9 +73,10 @@ module CharString
|
|
72
73
|
# returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
|
73
74
|
# 'NONE'(1-line), or nil
|
74
75
|
return nil if string == nil #=> nil (argument missing)
|
75
|
-
|
76
|
-
|
77
|
-
'
|
76
|
+
bin_string = string.dup.force_encoding("ASCII-8BIT")
|
77
|
+
eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
|
78
|
+
'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
|
79
|
+
'CRLF' => bin_string.scan(/(\r\n)/o).size}
|
78
80
|
eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
|
79
81
|
eols = eol_counts.keys
|
80
82
|
eol_variety = eols.size # numbers of flavors found
|
@@ -87,10 +89,6 @@ module CharString
|
|
87
89
|
end
|
88
90
|
end
|
89
91
|
|
90
|
-
def CharString.ruby_m17n?
|
91
|
-
"".respond_to?(:force_encoding)
|
92
|
-
end
|
93
|
-
|
94
92
|
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
95
93
|
# thus some of the following methods are not 'linguistically correct'.
|
96
94
|
|
@@ -128,7 +126,6 @@ module CharString
|
|
128
126
|
}.compact.size
|
129
127
|
end
|
130
128
|
|
131
|
-
if ruby_m17n?
|
132
129
|
# for Ruby-1.9
|
133
130
|
def encoding()
|
134
131
|
String.new(self).encoding.to_s
|
@@ -254,280 +251,6 @@ if ruby_m17n?
|
|
254
251
|
require 'docdiff/encoding/ja_eucjp'
|
255
252
|
require 'docdiff/encoding/ja_sjis'
|
256
253
|
require 'docdiff/encoding/ja_utf8'
|
257
|
-
else
|
258
|
-
# for Ruby-1.8
|
259
|
-
require 'iconv'
|
260
|
-
|
261
|
-
def encoding()
|
262
|
-
@encoding
|
263
|
-
# if @encoding
|
264
|
-
# @encoding
|
265
|
-
# else
|
266
|
-
# @encoding = CharString.guess_encoding(self)
|
267
|
-
# # raise "encoding is not set.\n"
|
268
|
-
# end
|
269
|
-
end
|
270
|
-
|
271
|
-
def encoding=(cs)
|
272
|
-
@encoding = cs
|
273
|
-
extend Encodings[@encoding] # ; p "Hey, I extended #{Encodings[@encoding]}!"
|
274
|
-
end
|
275
|
-
|
276
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
277
|
-
def CharString.guess_encoding(string)
|
278
|
-
return nil if string == nil
|
279
|
-
result_using_pureruby = CharString.guess_encoding_using_pureruby(string)
|
280
|
-
result_using_iconv = CharString.guess_encoding_using_iconv(string)
|
281
|
-
if result_using_pureruby == result_using_iconv
|
282
|
-
result_using_pureruby
|
283
|
-
else
|
284
|
-
"UNKNOWN"
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
289
|
-
def CharString.guess_encoding_using_pureruby(string)
|
290
|
-
return nil if string == nil
|
291
|
-
|
292
|
-
ascii_pat = '[\x00-\x7f]'
|
293
|
-
jis_pat = ['(?:(?:\x1b\x28\x42)',
|
294
|
-
'|(?:\x1b\x28\x4a)',
|
295
|
-
'|(?:\x1b\x28\x49)',
|
296
|
-
'|(?:\x1b\x24\x40)',
|
297
|
-
'|(?:\x1b\x24\x42)',
|
298
|
-
'|(?:\x1b\x24\x44))'].join
|
299
|
-
eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
300
|
-
'|(?:[\x20-\x7e])',
|
301
|
-
'|(?:\x8e[\xa1-\xdf])',
|
302
|
-
'|(?:[\xa1-\xfe][\xa1-\xfe])',
|
303
|
-
'|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
|
304
|
-
sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
305
|
-
'|(?:[\x20-\x7e])',
|
306
|
-
'|(?:[\xa1-\xdf])',
|
307
|
-
'|(?:[\x81-\x9f][\x40-\x7e])',
|
308
|
-
'|(?:[\xe0-\xef][\x80-\xfc]))'].join
|
309
|
-
utf8_pat = ['(?:(?:[\x00-\x7f])',
|
310
|
-
'|(?:[\xc0-\xdf][\x80-\xbf])',
|
311
|
-
'|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
|
312
|
-
'|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join
|
313
|
-
|
314
|
-
ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
|
315
|
-
jis_escseq_count = string.scan(/#{jis_pat}/on).size
|
316
|
-
eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
|
317
|
-
sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
|
318
|
-
utf8_match_length = string.scan(/#{utf8_pat}/no).join.length
|
319
|
-
|
320
|
-
case
|
321
|
-
when 0 < jis_escseq_count # JIS escape sequense found
|
322
|
-
guessed_encoding = 'JIS'
|
323
|
-
when ascii_match_length == string.length # every char is ASCII (but not JIS)
|
324
|
-
guessed_encoding = 'US-ASCII'
|
325
|
-
else
|
326
|
-
case
|
327
|
-
when eucjp_match_length < (string.length / 2) &&
|
328
|
-
sjis_match_length < (string.length / 2) &&
|
329
|
-
utf8_match_length < (string.length / 2)
|
330
|
-
guessed_encoding = 'UNKNOWN' # either encoding did not match long enough
|
331
|
-
when (eucjp_match_length < utf8_match_length) &&
|
332
|
-
(sjis_match_length < utf8_match_length)
|
333
|
-
guessed_encoding = 'UTF-8'
|
334
|
-
when (eucjp_match_length < sjis_match_length) &&
|
335
|
-
(utf8_match_length < sjis_match_length)
|
336
|
-
guessed_encoding = 'Shift_JIS'
|
337
|
-
when (sjis_match_length < eucjp_match_length) &&
|
338
|
-
(utf8_match_length < eucjp_match_length)
|
339
|
-
guessed_encoding = 'EUC-JP'
|
340
|
-
else
|
341
|
-
guessed_encoding = 'UNKNOWN' # cannot guess at all
|
342
|
-
end
|
343
|
-
end
|
344
|
-
return guessed_encoding
|
345
|
-
end
|
346
|
-
|
347
|
-
def CharString.guess_encoding_using_iconv(string)
|
348
|
-
valid_as_utf8 = CharString.valid_as("utf-8", string)
|
349
|
-
valid_as_sjis = CharString.valid_as("cp932", string) # not sjis, but cp932
|
350
|
-
valid_as_jis = CharString.valid_as("iso-2022-jp", string)
|
351
|
-
valid_as_eucjp = CharString.valid_as("eucjp", string)
|
352
|
-
valid_as_ascii = CharString.valid_as("ascii", string)
|
353
|
-
invalid_as_utf8 = CharString.invalid_as("utf-8", string)
|
354
|
-
invalid_as_sjis = CharString.invalid_as("cp932", string) # not sjis, but cp932
|
355
|
-
invalid_as_jis = CharString.invalid_as("iso-2022-jp", string)
|
356
|
-
invalid_as_eucjp = CharString.invalid_as("eucjp", string)
|
357
|
-
invalid_as_ascii = CharString.invalid_as("ascii", string)
|
358
|
-
case
|
359
|
-
when string == nil
|
360
|
-
nil
|
361
|
-
when valid_as_ascii
|
362
|
-
"US-ASCII"
|
363
|
-
when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
|
364
|
-
"JIS"
|
365
|
-
when valid_as_eucjp
|
366
|
-
"EUC-JP"
|
367
|
-
when valid_as_sjis && invalid_as_utf8 && invalid_as_eucjp && invalid_as_jis
|
368
|
-
"Shift_JIS"
|
369
|
-
when valid_as_utf8 && invalid_as_sjis && invalid_as_eucjp && invalid_as_jis
|
370
|
-
"UTF-8"
|
371
|
-
else
|
372
|
-
"UNKNOWN"
|
373
|
-
end
|
374
|
-
end
|
375
|
-
|
376
|
-
def CharString.valid_as(encoding_name, string)
|
377
|
-
begin
|
378
|
-
Iconv.iconv(encoding_name, encoding_name, string)
|
379
|
-
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
|
380
|
-
return false
|
381
|
-
else
|
382
|
-
return true
|
383
|
-
end
|
384
|
-
end
|
385
|
-
|
386
|
-
def CharString.invalid_as(encoding_name, string)
|
387
|
-
if CharString.valid_as(encoding_name, string)
|
388
|
-
false
|
389
|
-
else
|
390
|
-
true
|
391
|
-
end
|
392
|
-
end
|
393
|
-
|
394
|
-
def split_to_byte()
|
395
|
-
scan(/./nm)
|
396
|
-
end
|
397
|
-
|
398
|
-
def split_to_char()
|
399
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
400
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
401
|
-
if eol_char # sometimes string has no end-of-line char
|
402
|
-
scan(Regexp.new("(?:#{eol_char})|(?:.)",
|
403
|
-
Regexp::MULTILINE,
|
404
|
-
encoding.sub(/ASCII/i, 'none'))
|
405
|
-
)
|
406
|
-
else # it seems that no EOL module was extended...
|
407
|
-
scan(Regexp.new("(?:.)",
|
408
|
-
Regexp::MULTILINE,
|
409
|
-
encoding.sub(/ASCII/i, 'none'))
|
410
|
-
)
|
411
|
-
end
|
412
|
-
end
|
413
|
-
|
414
|
-
def count_latin_graph_char()
|
415
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
416
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
417
|
-
scan(Regexp.new("[#{Encodings[encoding]::GRAPH}]",
|
418
|
-
Regexp::MULTILINE,
|
419
|
-
encoding.sub(/ASCII/i, 'none'))
|
420
|
-
).size
|
421
|
-
end
|
422
|
-
|
423
|
-
def count_ja_graph_char()
|
424
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
425
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
426
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
427
|
-
Regexp::MULTILINE,
|
428
|
-
encoding.sub(/ASCII/i, 'none'))
|
429
|
-
).size
|
430
|
-
end
|
431
|
-
|
432
|
-
def count_latin_blank_char()
|
433
|
-
scan(Regexp.new("[#{Encodings[encoding]::BLANK}]",
|
434
|
-
Regexp::MULTILINE,
|
435
|
-
encoding.sub(/ASCII/i, 'none'))
|
436
|
-
).size
|
437
|
-
end
|
438
|
-
|
439
|
-
def count_ja_blank_char()
|
440
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
|
441
|
-
Regexp::MULTILINE,
|
442
|
-
encoding.sub(/ASCII/i, 'none'))
|
443
|
-
).size
|
444
|
-
end
|
445
|
-
|
446
|
-
def split_to_word()
|
447
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
448
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
449
|
-
scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
|
450
|
-
Regexp::MULTILINE,
|
451
|
-
encoding.sub(/ASCII/i, 'none'))
|
452
|
-
)
|
453
|
-
end
|
454
|
-
|
455
|
-
def count_latin_word()
|
456
|
-
split_to_word.collect{|word|
|
457
|
-
word if Regexp.new("[#{Encodings[encoding]::PRINT}]",
|
458
|
-
Regexp::MULTILINE,
|
459
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
460
|
-
}.compact.size
|
461
|
-
end
|
462
|
-
|
463
|
-
def count_ja_word()
|
464
|
-
split_to_word.collect{|word|
|
465
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
|
466
|
-
Regexp::MULTILINE,
|
467
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
468
|
-
}.compact.size
|
469
|
-
end
|
470
|
-
|
471
|
-
def count_latin_valid_word()
|
472
|
-
split_to_word.collect{|word|
|
473
|
-
word if Regexp.new("[#{Encodings[encoding]::ALNUM}]",
|
474
|
-
Regexp::MULTILINE,
|
475
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
476
|
-
}.compact.size
|
477
|
-
end
|
478
|
-
|
479
|
-
def count_ja_valid_word()
|
480
|
-
split_to_word.collect{|word|
|
481
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
482
|
-
Regexp::MULTILINE,
|
483
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
484
|
-
}.compact.size
|
485
|
-
end
|
486
|
-
|
487
|
-
def split_to_line()
|
488
|
-
# scan(Regexp.new(".*?#{eol_char}|.+",
|
489
|
-
# Regexp::MULTILINE,
|
490
|
-
# encoding.sub(/ASCII/i, 'none'))
|
491
|
-
# )
|
492
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
493
|
-
raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
494
|
-
if defined? eol_char
|
495
|
-
scan(Regexp.new(".*?#{eol_char}|.+",
|
496
|
-
Regexp::MULTILINE,
|
497
|
-
encoding.sub(/ASCII/i, 'none'))
|
498
|
-
)
|
499
|
-
else
|
500
|
-
scan(Regexp.new(".+",
|
501
|
-
Regexp::MULTILINE,
|
502
|
-
encoding.sub(/ASCII/i, 'none'))
|
503
|
-
)
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
def count_graph_line()
|
508
|
-
split_to_line.collect{|line|
|
509
|
-
line if Regexp.new("[#{Encodings[encoding]::GRAPH}" +
|
510
|
-
"#{Encodings[encoding]::JA_GRAPH}]",
|
511
|
-
Regexp::MULTILINE,
|
512
|
-
encoding.sub(/ASCII/, 'none')).match line
|
513
|
-
}.compact.size
|
514
|
-
end
|
515
|
-
|
516
|
-
def count_blank_line()
|
517
|
-
split_to_line.collect{|line|
|
518
|
-
line if Regexp.new("^[#{Encodings[encoding]::BLANK}" +
|
519
|
-
"#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
|
520
|
-
Regexp::MULTILINE,
|
521
|
-
encoding.sub(/ASCII/, 'none')).match line
|
522
|
-
}.compact.size
|
523
|
-
end
|
524
|
-
|
525
|
-
# load encoding modules
|
526
|
-
require 'docdiff/encoding/en_ascii'
|
527
|
-
require 'docdiff/encoding/ja_eucjp'
|
528
|
-
require 'docdiff/encoding/ja_sjis'
|
529
|
-
require 'docdiff/encoding/ja_utf8'
|
530
|
-
end # end ruby_m17n?
|
531
254
|
alias to_bytes split_to_byte
|
532
255
|
alias to_chars split_to_char
|
533
256
|
alias to_words split_to_word
|
@@ -573,6 +296,7 @@ end # end ruby_m17n?
|
|
573
296
|
end
|
574
297
|
|
575
298
|
end # module CharString
|
299
|
+
end # class DocDiff
|
576
300
|
|
577
301
|
# class String
|
578
302
|
# include CharString
|