docdiff 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +5 -3
- data/Gemfile +1 -1
- data/Makefile +15 -19
- data/Rakefile +45 -10
- data/bin/docdiff +9 -11
- data/devutil/Rakefile +9 -0
- data/devutil/changelog.sh +40 -0
- data/docdiff.gemspec +4 -4
- data/docdiffwebui.cgi +1 -1
- data/langfilter.rb +1 -5
- data/lib/doc_diff.rb +5 -1
- data/lib/docdiff.rb +1 -1
- data/lib/docdiff/charstring.rb +6 -282
- data/lib/docdiff/diff.rb +2 -0
- data/lib/docdiff/diff/contours.rb +2 -1
- data/lib/docdiff/diff/editscript.rb +2 -0
- data/lib/docdiff/diff/rcsdiff.rb +2 -0
- data/lib/docdiff/diff/shortestpath.rb +2 -0
- data/lib/docdiff/diff/speculative.rb +6 -3
- data/lib/docdiff/diff/subsequence.rb +2 -0
- data/lib/docdiff/diff/unidiff.rb +2 -0
- data/lib/docdiff/difference.rb +2 -0
- data/lib/docdiff/document.rb +2 -0
- data/lib/docdiff/encoding/en_ascii.rb +3 -1
- data/lib/docdiff/encoding/ja_eucjp.rb +3 -1
- data/lib/docdiff/encoding/ja_sjis.rb +3 -1
- data/lib/docdiff/encoding/ja_utf8.rb +3 -1
- data/lib/docdiff/version.rb +1 -1
- data/lib/docdiff/view.rb +4 -10
- data/lib/viewdiff.rb +9 -5
- data/readme.html +23 -3
- data/readme.md +184 -0
- data/test/charstring_test.rb +13 -26
- data/test/diff_test.rb +2 -1
- data/test/difference_test.rb +2 -1
- data/test/docdiff_test.rb +9 -2
- data/test/document_test.rb +4 -6
- data/test/view_test.rb +3 -1
- data/test/viewdiff_test.rb +14 -11
- metadata +23 -29
- data/devutil/JIS0208.TXT +0 -6952
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f8733587f13d662ca5c5fda60843298ef71ef798284d9f677d03378d8b5e0e29
|
4
|
+
data.tar.gz: f92804fdb17576aaded799010553a0d6685dbb8679e53cbf6cf15c4773af8ddf
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 94af3213ec734c2b80ad72f70bb1d1312d14da7d15e77237f1824ade5d83f39d666bf8049dcb4c6a2898b402ab80b64c22184bb93359f23a8496f83d0e191f0b
|
7
|
+
data.tar.gz: dfd7a6f65ff88a556b5ccb5612daac005b3405cb0f36f64cc5f4914a061bdbd20c4ac4ab5610c657b9036f09485a18e912e527bc5c78b276d2a494bfc977455a
|
data/.travis.yml
CHANGED
data/Gemfile
CHANGED
data/Makefile
CHANGED
@@ -1,19 +1,21 @@
|
|
1
|
+
# Warning: this Makefile is obsolete, use Rakefile instead
|
2
|
+
|
1
3
|
PRODUCT = docdiff
|
2
|
-
VERSION =
|
4
|
+
VERSION = $(shell $(RUBY) -r./lib/docdiff/version.rb -e 'Docdiff::VERSION.display')
|
3
5
|
RUBY = ruby
|
4
6
|
TAR_XVCS = tar --exclude=.svn --exclude=.git
|
5
7
|
|
6
8
|
DOCS = ChangeLog readme.en.html readme.ja.html \
|
7
9
|
index.en.html index.ja.html
|
8
10
|
DOCSRC = readme.html index.html img sample
|
9
|
-
TESTS =
|
10
|
-
|
11
|
-
DIST = Makefile devutil docdiff docdiff.conf.example docdiff.rb \
|
11
|
+
TESTS = test/*_test.rb
|
12
|
+
DIST = Makefile devutil lib docdiff.conf.example bin/docdiff \
|
12
13
|
docdiff.gemspec \
|
13
14
|
docdiffwebui.html docdiffwebui.cgi \
|
14
15
|
$(DOCSRC) $(DOCS) $(TESTS)
|
15
|
-
TESTLOGS =
|
16
|
-
|
16
|
+
TESTLOGS = $(foreach t,\
|
17
|
+
$(wildcard test/*_test.rb),\
|
18
|
+
$(t:test/%_test.rb=%_test.log)) \
|
17
19
|
|
18
20
|
WWWUSER = hisashim,docdiff
|
19
21
|
WWWSITE = web.sourceforge.net
|
@@ -28,22 +30,16 @@ all: $(DOCS)
|
|
28
30
|
|
29
31
|
testall:
|
30
32
|
$(MAKE) test RUBY=ruby1.9.1
|
31
|
-
$(MAKE) test RUBY=ruby1.8
|
32
33
|
|
33
34
|
test: $(TESTLOGS)
|
34
35
|
|
35
|
-
|
36
|
-
$(RUBY) -I
|
36
|
+
%_test.log:
|
37
|
+
$(RUBY) -I./lib test/$*_test.rb | tee $@
|
37
38
|
|
38
39
|
docs: $(DOCS)
|
39
40
|
|
40
41
|
ChangeLog:
|
41
|
-
|
42
|
-
if [ -d .svn ] ; then \
|
43
|
-
svn log -rHEAD:0 -v > ChangeLog ; \
|
44
|
-
else \
|
45
|
-
git svn log > ChangeLog ; \
|
46
|
-
fi
|
42
|
+
devutil/changelog.sh > $@
|
47
43
|
|
48
44
|
readme.%.html: readme.html
|
49
45
|
$(RUBY) -Ku langfilter.rb --$* $< > $@
|
@@ -54,13 +50,13 @@ install: $(DIST)
|
|
54
50
|
@if [ ! -d $(DESTDIR)$(PREFIX)/bin ]; then \
|
55
51
|
mkdir -p $(DESTDIR)$(PREFIX)/bin; \
|
56
52
|
fi
|
57
|
-
cp -Ppv docdiff
|
53
|
+
cp -Ppv bin/docdiff $(DESTDIR)$(PREFIX)/bin/
|
58
54
|
chmod +x $(DESTDIR)$(PREFIX)/bin/docdiff
|
59
55
|
|
60
|
-
@if [ ! -d $(datadir)
|
61
|
-
mkdir -p $(datadir)
|
56
|
+
@if [ ! -d $(datadir)/$(PRODUCT) ]; then \
|
57
|
+
mkdir -p $(datadir)/$(PRODUCT); \
|
62
58
|
fi
|
63
|
-
($(TAR_XVCS) -cf -
|
59
|
+
(cd lib && $(TAR_XVCS) -cf - *) | (cd $(datadir)/$(PRODUCT) && tar -xpf -)
|
64
60
|
|
65
61
|
@if [ ! -d $(DESTDIR)/etc/$(PRODUCT) ]; then \
|
66
62
|
mkdir -p $(DESTDIR)/etc/$(PRODUCT); \
|
data/Rakefile
CHANGED
@@ -1,17 +1,52 @@
|
|
1
|
+
require 'rake/clean'
|
2
|
+
require 'rake/testtask'
|
1
3
|
require 'bundler/gem_tasks'
|
2
4
|
|
3
|
-
|
5
|
+
RUBY = ENV['RUBY'] ||= 'ruby'
|
6
|
+
DOCS = FileList['ChangeLog', 'readme.en.html', 'readme.ja.html',
|
7
|
+
'index.en.html', 'index.ja.html']
|
8
|
+
DOCSRC = FileList['readme.html', 'index.html', 'img', 'sample']
|
9
|
+
TESTS = FileList['test/*_test.rb']
|
10
|
+
TESTLOGS = Dir.glob('test/*_test.rb').map{|f|
|
11
|
+
File.basename(f).ext('log')
|
12
|
+
}
|
13
|
+
|
14
|
+
WWWUSER = ENV['WWWUSER'] ||= 'hisashim,docdiff'
|
15
|
+
WWWSITE = ENV['WWWSITE'] ||= 'web.sourceforge.net'
|
16
|
+
WWWSITEPATH = ENV['WWWSITEPATH'] ||= 'htdocs/'
|
17
|
+
WWWDRYRUN = ENV['WWWDRYRUN'] ||= '--dry-run'
|
18
|
+
|
4
19
|
Rake::TestTask.new do |t|
|
5
|
-
t.test_files =
|
20
|
+
t.test_files = TESTS
|
6
21
|
t.verbose = true
|
7
22
|
end
|
8
23
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
end
|
24
|
+
task :default => :test
|
25
|
+
|
26
|
+
desc "generate documents"
|
27
|
+
task :docs => DOCS
|
28
|
+
|
29
|
+
file 'ChangeLog' do |t|
|
30
|
+
sh "devutil/changelog.sh > #{t.name}"
|
17
31
|
end
|
32
|
+
|
33
|
+
rule(/.*\.(?:en|ja)\.html/ => proc{|tn| tn.gsub(/\.(?:en|ja)/, '')}) do |t|
|
34
|
+
sh "#{RUBY} -E UTF-8 langfilter.rb" +
|
35
|
+
" --#{t.name.gsub(/.*?\.(en|ja)\.html/){$1}}" +
|
36
|
+
" #{t.prerequisites.first} > #{t.name}"
|
37
|
+
end
|
38
|
+
|
39
|
+
desc "force to rsync web contents"
|
40
|
+
task :wwwupload do |t|
|
41
|
+
sh "rake www WWWDRYRUN="
|
42
|
+
end
|
43
|
+
|
44
|
+
desc "rsync web contents"
|
45
|
+
task :www => DOCSRC + DOCS do |t|
|
46
|
+
sh "rsync #{WWWDRYRUN} -auv -e ssh --delete" +
|
47
|
+
" --exclude='.svn' --exclude='.git'" +
|
48
|
+
t.prerequisites.join(' ') +
|
49
|
+
" #{WWWUSER}@#{WWWSITE}:#{WWWSITEPATH}"
|
50
|
+
end
|
51
|
+
|
52
|
+
CLEAN.include(DOCS, TESTLOGS)
|
data/bin/docdiff
CHANGED
@@ -1,11 +1,8 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# DocDiff: word/character-oriented text comparison utility
|
3
3
|
# Copyright (C) 2002-2011 Hisashi MORITA
|
4
|
-
# Requirements: Ruby (>=
|
4
|
+
# Requirements: Ruby (>= 2.0)
|
5
5
|
require 'docdiff'
|
6
|
-
require 'docdiff/difference'
|
7
|
-
require 'docdiff/document'
|
8
|
-
require 'docdiff/view'
|
9
6
|
require 'optparse'
|
10
7
|
|
11
8
|
# do_config_stuff
|
@@ -40,7 +37,7 @@ ARGV.options {|o|
|
|
40
37
|
o.def_option('--char', 'set resolution to char'){clo[:resolution] = "char"}
|
41
38
|
|
42
39
|
o.def_option('--encoding=ENCODING',
|
43
|
-
possible_encodings = ['ASCII','EUC-JP','Shift_JIS','UTF-8','auto'],
|
40
|
+
possible_encodings = ['ASCII','EUC-JP','Shift_JIS','CP932','UTF-8','auto'],
|
44
41
|
'specify character encoding',
|
45
42
|
possible_encodings.join('|'), "(default is auto. try ASCII for single byte encodings such as ISO-8859-X)"
|
46
43
|
){|s| clo[:encoding] = (s || "auto")}
|
@@ -48,6 +45,7 @@ ARGV.options {|o|
|
|
48
45
|
o.def_option('--iso8859x', 'same as --encoding=ASCII'){clo[:encoding] = "ASCII"}
|
49
46
|
o.def_option('--eucjp', 'same as --encoding=EUC-JP'){clo[:encoding] = "EUC-JP"}
|
50
47
|
o.def_option('--sjis', 'same as --encoding=Shift_JIS'){clo[:encoding] = "Shift_JIS"}
|
48
|
+
o.def_option('--cp932', 'same as --encoding=CP932'){clo[:encoding] = "CP932"}
|
51
49
|
o.def_option('--utf8', 'same as --encoding=UTF-8'){clo[:encoding] = "UTF-8"}
|
52
50
|
|
53
51
|
o.def_option('--eol=EOL',
|
@@ -145,8 +143,8 @@ eol1 = docdiff.config[:eol]
|
|
145
143
|
eol2 = docdiff.config[:eol]
|
146
144
|
|
147
145
|
if docdiff.config[:encoding] == "auto"
|
148
|
-
encoding1 = CharString.guess_encoding(file1_content)
|
149
|
-
encoding2 = CharString.guess_encoding(file2_content)
|
146
|
+
encoding1 = DocDiff::CharString.guess_encoding(file1_content)
|
147
|
+
encoding2 = DocDiff::CharString.guess_encoding(file2_content)
|
150
148
|
case
|
151
149
|
when (encoding1 == "UNKNOWN" or encoding2 == "UNKNOWN")
|
152
150
|
raise "Document encoding unknown (#{encoding1}, #{encoding2})."
|
@@ -156,8 +154,8 @@ if docdiff.config[:encoding] == "auto"
|
|
156
154
|
end
|
157
155
|
|
158
156
|
if docdiff.config[:eol] == "auto"
|
159
|
-
eol1 = CharString.guess_eol(file1_content)
|
160
|
-
eol2 = CharString.guess_eol(file2_content)
|
157
|
+
eol1 = DocDiff::CharString.guess_eol(file1_content)
|
158
|
+
eol2 = DocDiff::CharString.guess_eol(file2_content)
|
161
159
|
case
|
162
160
|
when (eol1.nil? or eol2.nil?)
|
163
161
|
raise "Document eol is nil (#{eol1.inspect}, #{eol2.inspect}). The document might be empty."
|
@@ -168,8 +166,8 @@ if docdiff.config[:eol] == "auto"
|
|
168
166
|
end
|
169
167
|
end
|
170
168
|
|
171
|
-
doc1 = Document.new(file1_content, encoding1, eol1)
|
172
|
-
doc2 = Document.new(file2_content, encoding2, eol2)
|
169
|
+
doc1 = DocDiff::Document.new(file1_content, encoding1, eol1)
|
170
|
+
doc2 = DocDiff::Document.new(file2_content, encoding2, eol2)
|
173
171
|
|
174
172
|
output = docdiff.run(doc1, doc2,
|
175
173
|
{:resolution => docdiff.config[:resolution],
|
data/devutil/Rakefile
ADDED
data/devutil/changelog.sh
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
# ChangeLog Generator
|
3
|
+
# Copyright 2011 Hisashi Morita
|
4
|
+
# License: Public Domain
|
5
|
+
#
|
6
|
+
# Usage:
|
7
|
+
# changelog.sh [WORKING_DIR] > ChangeLog
|
8
|
+
|
9
|
+
if [ "$1" ]; then
|
10
|
+
WD="$1"
|
11
|
+
else
|
12
|
+
WD="."
|
13
|
+
fi
|
14
|
+
|
15
|
+
# Subversion
|
16
|
+
which svn >/dev/null
|
17
|
+
if [ x"$?" = x0 ]; then
|
18
|
+
(svn info "${WD}" >/dev/null 2>&1) && SVN=TRUE
|
19
|
+
if [ x"${SVN}" = xTRUE ]; then
|
20
|
+
(cd "${WD}"; svn log -rBASE:0 -v)
|
21
|
+
fi
|
22
|
+
fi
|
23
|
+
|
24
|
+
# Git
|
25
|
+
which git >/dev/null
|
26
|
+
if [ x"$?" = x0 ]; then
|
27
|
+
(cd "${WD}" && git status --porcelain >/dev/null 2>&1) && GIT=TRUE
|
28
|
+
if [ x"${GIT}" = xTRUE ]; then
|
29
|
+
(cd "${WD}"; git log | cat)
|
30
|
+
fi
|
31
|
+
fi
|
32
|
+
|
33
|
+
# Mercurial
|
34
|
+
which hg >/dev/null
|
35
|
+
if [ x"$?" = x0 ]; then
|
36
|
+
(hg status "${WD}" >/dev/null 2>&1) && HG=TRUE
|
37
|
+
if [ x"${HG}" = xTRUE ]; then
|
38
|
+
(cd "${WD}"; hg log --rev tip:0)
|
39
|
+
fi
|
40
|
+
fi
|
data/docdiff.gemspec
CHANGED
@@ -1,20 +1,20 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
$:.
|
2
|
+
$:.unshift File.expand_path("../lib", __FILE__)
|
3
3
|
require "docdiff/version"
|
4
4
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = "docdiff"
|
7
7
|
s.version = Docdiff::VERSION
|
8
|
+
s.license = "BSD-3-Clause"
|
8
9
|
s.authors = ["Hisashi Morita"]
|
9
|
-
s.email = ["hisashim at
|
10
|
-
s.homepage = "
|
10
|
+
s.email = ["hisashim at workbook.org"]
|
11
|
+
s.homepage = "https://github.com/hisashim/docdiff"
|
11
12
|
s.summary = %q{Word-by-word diff}
|
12
13
|
s.description = %q{DocDiff compares two text files and shows the
|
13
14
|
difference. It can compare files word by word,
|
14
15
|
character by character, or line by line. It has
|
15
16
|
several output formats such as HTML, tty, Manued,
|
16
17
|
or user-defined markup.}
|
17
|
-
s.rubyforge_project = "docdiff"
|
18
18
|
|
19
19
|
s.files = `git ls-files`.split("\n")
|
20
20
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
data/docdiffwebui.cgi
CHANGED
data/langfilter.rb
CHANGED
@@ -2,13 +2,9 @@
|
|
2
2
|
# language filter
|
3
3
|
# usage: langfilter.rb --en <infile >outfile
|
4
4
|
|
5
|
-
def ruby_m17n?
|
6
|
-
return true if "".respond_to? :encoding
|
7
|
-
end
|
8
|
-
|
9
5
|
lang_to_include = ARGV.shift.gsub(/-+/, "")
|
10
6
|
lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
|
11
7
|
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m
|
12
8
|
|
13
|
-
ARGF.set_encoding("UTF-8")
|
9
|
+
ARGF.set_encoding("UTF-8")
|
14
10
|
ARGF.read.gsub(re, "").display
|
data/lib/doc_diff.rb
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
# DocDiff: word/character-oriented text comparison utility
|
2
2
|
# Copyright (C) 2002-2011 Hisashi MORITA
|
3
|
-
# Requirements: Ruby (>=
|
3
|
+
# Requirements: Ruby (>= 2.0)
|
4
|
+
require 'docdiff/difference'
|
5
|
+
require 'docdiff/document'
|
6
|
+
require 'docdiff/view'
|
7
|
+
|
4
8
|
class DocDiff
|
5
9
|
|
6
10
|
AppVersion = Docdiff::VERSION
|
data/lib/docdiff.rb
CHANGED
data/lib/docdiff/charstring.rb
CHANGED
@@ -3,6 +3,7 @@
|
|
3
3
|
# To use, include to String, or extend String.
|
4
4
|
# 2003- Hisashi MORITA
|
5
5
|
|
6
|
+
class DocDiff
|
6
7
|
module CharString
|
7
8
|
|
8
9
|
Encodings = {}
|
@@ -72,9 +73,10 @@ module CharString
|
|
72
73
|
# returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
|
73
74
|
# 'NONE'(1-line), or nil
|
74
75
|
return nil if string == nil #=> nil (argument missing)
|
75
|
-
|
76
|
-
|
77
|
-
'
|
76
|
+
bin_string = string.dup.force_encoding("ASCII-8BIT")
|
77
|
+
eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
|
78
|
+
'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
|
79
|
+
'CRLF' => bin_string.scan(/(\r\n)/o).size}
|
78
80
|
eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
|
79
81
|
eols = eol_counts.keys
|
80
82
|
eol_variety = eols.size # numbers of flavors found
|
@@ -87,10 +89,6 @@ module CharString
|
|
87
89
|
end
|
88
90
|
end
|
89
91
|
|
90
|
-
def CharString.ruby_m17n?
|
91
|
-
"".respond_to?(:force_encoding)
|
92
|
-
end
|
93
|
-
|
94
92
|
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
95
93
|
# thus some of the following methods are not 'linguistically correct'.
|
96
94
|
|
@@ -128,7 +126,6 @@ module CharString
|
|
128
126
|
}.compact.size
|
129
127
|
end
|
130
128
|
|
131
|
-
if ruby_m17n?
|
132
129
|
# for Ruby-1.9
|
133
130
|
def encoding()
|
134
131
|
String.new(self).encoding.to_s
|
@@ -254,280 +251,6 @@ if ruby_m17n?
|
|
254
251
|
require 'docdiff/encoding/ja_eucjp'
|
255
252
|
require 'docdiff/encoding/ja_sjis'
|
256
253
|
require 'docdiff/encoding/ja_utf8'
|
257
|
-
else
|
258
|
-
# for Ruby-1.8
|
259
|
-
require 'iconv'
|
260
|
-
|
261
|
-
def encoding()
|
262
|
-
@encoding
|
263
|
-
# if @encoding
|
264
|
-
# @encoding
|
265
|
-
# else
|
266
|
-
# @encoding = CharString.guess_encoding(self)
|
267
|
-
# # raise "encoding is not set.\n"
|
268
|
-
# end
|
269
|
-
end
|
270
|
-
|
271
|
-
def encoding=(cs)
|
272
|
-
@encoding = cs
|
273
|
-
extend Encodings[@encoding] # ; p "Hey, I extended #{Encodings[@encoding]}!"
|
274
|
-
end
|
275
|
-
|
276
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
277
|
-
def CharString.guess_encoding(string)
|
278
|
-
return nil if string == nil
|
279
|
-
result_using_pureruby = CharString.guess_encoding_using_pureruby(string)
|
280
|
-
result_using_iconv = CharString.guess_encoding_using_iconv(string)
|
281
|
-
if result_using_pureruby == result_using_iconv
|
282
|
-
result_using_pureruby
|
283
|
-
else
|
284
|
-
"UNKNOWN"
|
285
|
-
end
|
286
|
-
end
|
287
|
-
|
288
|
-
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
|
289
|
-
def CharString.guess_encoding_using_pureruby(string)
|
290
|
-
return nil if string == nil
|
291
|
-
|
292
|
-
ascii_pat = '[\x00-\x7f]'
|
293
|
-
jis_pat = ['(?:(?:\x1b\x28\x42)',
|
294
|
-
'|(?:\x1b\x28\x4a)',
|
295
|
-
'|(?:\x1b\x28\x49)',
|
296
|
-
'|(?:\x1b\x24\x40)',
|
297
|
-
'|(?:\x1b\x24\x42)',
|
298
|
-
'|(?:\x1b\x24\x44))'].join
|
299
|
-
eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
300
|
-
'|(?:[\x20-\x7e])',
|
301
|
-
'|(?:\x8e[\xa1-\xdf])',
|
302
|
-
'|(?:[\xa1-\xfe][\xa1-\xfe])',
|
303
|
-
'|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
|
304
|
-
sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
|
305
|
-
'|(?:[\x20-\x7e])',
|
306
|
-
'|(?:[\xa1-\xdf])',
|
307
|
-
'|(?:[\x81-\x9f][\x40-\x7e])',
|
308
|
-
'|(?:[\xe0-\xef][\x80-\xfc]))'].join
|
309
|
-
utf8_pat = ['(?:(?:[\x00-\x7f])',
|
310
|
-
'|(?:[\xc0-\xdf][\x80-\xbf])',
|
311
|
-
'|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
|
312
|
-
'|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join
|
313
|
-
|
314
|
-
ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
|
315
|
-
jis_escseq_count = string.scan(/#{jis_pat}/on).size
|
316
|
-
eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
|
317
|
-
sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
|
318
|
-
utf8_match_length = string.scan(/#{utf8_pat}/no).join.length
|
319
|
-
|
320
|
-
case
|
321
|
-
when 0 < jis_escseq_count # JIS escape sequense found
|
322
|
-
guessed_encoding = 'JIS'
|
323
|
-
when ascii_match_length == string.length # every char is ASCII (but not JIS)
|
324
|
-
guessed_encoding = 'US-ASCII'
|
325
|
-
else
|
326
|
-
case
|
327
|
-
when eucjp_match_length < (string.length / 2) &&
|
328
|
-
sjis_match_length < (string.length / 2) &&
|
329
|
-
utf8_match_length < (string.length / 2)
|
330
|
-
guessed_encoding = 'UNKNOWN' # either encoding did not match long enough
|
331
|
-
when (eucjp_match_length < utf8_match_length) &&
|
332
|
-
(sjis_match_length < utf8_match_length)
|
333
|
-
guessed_encoding = 'UTF-8'
|
334
|
-
when (eucjp_match_length < sjis_match_length) &&
|
335
|
-
(utf8_match_length < sjis_match_length)
|
336
|
-
guessed_encoding = 'Shift_JIS'
|
337
|
-
when (sjis_match_length < eucjp_match_length) &&
|
338
|
-
(utf8_match_length < eucjp_match_length)
|
339
|
-
guessed_encoding = 'EUC-JP'
|
340
|
-
else
|
341
|
-
guessed_encoding = 'UNKNOWN' # cannot guess at all
|
342
|
-
end
|
343
|
-
end
|
344
|
-
return guessed_encoding
|
345
|
-
end
|
346
|
-
|
347
|
-
def CharString.guess_encoding_using_iconv(string)
|
348
|
-
valid_as_utf8 = CharString.valid_as("utf-8", string)
|
349
|
-
valid_as_sjis = CharString.valid_as("cp932", string) # not sjis, but cp932
|
350
|
-
valid_as_jis = CharString.valid_as("iso-2022-jp", string)
|
351
|
-
valid_as_eucjp = CharString.valid_as("eucjp", string)
|
352
|
-
valid_as_ascii = CharString.valid_as("ascii", string)
|
353
|
-
invalid_as_utf8 = CharString.invalid_as("utf-8", string)
|
354
|
-
invalid_as_sjis = CharString.invalid_as("cp932", string) # not sjis, but cp932
|
355
|
-
invalid_as_jis = CharString.invalid_as("iso-2022-jp", string)
|
356
|
-
invalid_as_eucjp = CharString.invalid_as("eucjp", string)
|
357
|
-
invalid_as_ascii = CharString.invalid_as("ascii", string)
|
358
|
-
case
|
359
|
-
when string == nil
|
360
|
-
nil
|
361
|
-
when valid_as_ascii
|
362
|
-
"US-ASCII"
|
363
|
-
when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
|
364
|
-
"JIS"
|
365
|
-
when valid_as_eucjp
|
366
|
-
"EUC-JP"
|
367
|
-
when valid_as_sjis && invalid_as_utf8 && invalid_as_eucjp && invalid_as_jis
|
368
|
-
"Shift_JIS"
|
369
|
-
when valid_as_utf8 && invalid_as_sjis && invalid_as_eucjp && invalid_as_jis
|
370
|
-
"UTF-8"
|
371
|
-
else
|
372
|
-
"UNKNOWN"
|
373
|
-
end
|
374
|
-
end
|
375
|
-
|
376
|
-
def CharString.valid_as(encoding_name, string)
|
377
|
-
begin
|
378
|
-
Iconv.iconv(encoding_name, encoding_name, string)
|
379
|
-
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
|
380
|
-
return false
|
381
|
-
else
|
382
|
-
return true
|
383
|
-
end
|
384
|
-
end
|
385
|
-
|
386
|
-
def CharString.invalid_as(encoding_name, string)
|
387
|
-
if CharString.valid_as(encoding_name, string)
|
388
|
-
false
|
389
|
-
else
|
390
|
-
true
|
391
|
-
end
|
392
|
-
end
|
393
|
-
|
394
|
-
def split_to_byte()
|
395
|
-
scan(/./nm)
|
396
|
-
end
|
397
|
-
|
398
|
-
def split_to_char()
|
399
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
400
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
401
|
-
if eol_char # sometimes string has no end-of-line char
|
402
|
-
scan(Regexp.new("(?:#{eol_char})|(?:.)",
|
403
|
-
Regexp::MULTILINE,
|
404
|
-
encoding.sub(/ASCII/i, 'none'))
|
405
|
-
)
|
406
|
-
else # it seems that no EOL module was extended...
|
407
|
-
scan(Regexp.new("(?:.)",
|
408
|
-
Regexp::MULTILINE,
|
409
|
-
encoding.sub(/ASCII/i, 'none'))
|
410
|
-
)
|
411
|
-
end
|
412
|
-
end
|
413
|
-
|
414
|
-
def count_latin_graph_char()
|
415
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
416
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
417
|
-
scan(Regexp.new("[#{Encodings[encoding]::GRAPH}]",
|
418
|
-
Regexp::MULTILINE,
|
419
|
-
encoding.sub(/ASCII/i, 'none'))
|
420
|
-
).size
|
421
|
-
end
|
422
|
-
|
423
|
-
def count_ja_graph_char()
|
424
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
425
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
426
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
427
|
-
Regexp::MULTILINE,
|
428
|
-
encoding.sub(/ASCII/i, 'none'))
|
429
|
-
).size
|
430
|
-
end
|
431
|
-
|
432
|
-
def count_latin_blank_char()
|
433
|
-
scan(Regexp.new("[#{Encodings[encoding]::BLANK}]",
|
434
|
-
Regexp::MULTILINE,
|
435
|
-
encoding.sub(/ASCII/i, 'none'))
|
436
|
-
).size
|
437
|
-
end
|
438
|
-
|
439
|
-
def count_ja_blank_char()
|
440
|
-
scan(Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
|
441
|
-
Regexp::MULTILINE,
|
442
|
-
encoding.sub(/ASCII/i, 'none'))
|
443
|
-
).size
|
444
|
-
end
|
445
|
-
|
446
|
-
def split_to_word()
|
447
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
448
|
-
# raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
449
|
-
scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
|
450
|
-
Regexp::MULTILINE,
|
451
|
-
encoding.sub(/ASCII/i, 'none'))
|
452
|
-
)
|
453
|
-
end
|
454
|
-
|
455
|
-
def count_latin_word()
|
456
|
-
split_to_word.collect{|word|
|
457
|
-
word if Regexp.new("[#{Encodings[encoding]::PRINT}]",
|
458
|
-
Regexp::MULTILINE,
|
459
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
460
|
-
}.compact.size
|
461
|
-
end
|
462
|
-
|
463
|
-
def count_ja_word()
|
464
|
-
split_to_word.collect{|word|
|
465
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
|
466
|
-
Regexp::MULTILINE,
|
467
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
468
|
-
}.compact.size
|
469
|
-
end
|
470
|
-
|
471
|
-
def count_latin_valid_word()
|
472
|
-
split_to_word.collect{|word|
|
473
|
-
word if Regexp.new("[#{Encodings[encoding]::ALNUM}]",
|
474
|
-
Regexp::MULTILINE,
|
475
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
476
|
-
}.compact.size
|
477
|
-
end
|
478
|
-
|
479
|
-
def count_ja_valid_word()
|
480
|
-
split_to_word.collect{|word|
|
481
|
-
word if Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
|
482
|
-
Regexp::MULTILINE,
|
483
|
-
encoding.sub(/ASCII/i, 'none')).match word
|
484
|
-
}.compact.size
|
485
|
-
end
|
486
|
-
|
487
|
-
def split_to_line()
|
488
|
-
# scan(Regexp.new(".*?#{eol_char}|.+",
|
489
|
-
# Regexp::MULTILINE,
|
490
|
-
# encoding.sub(/ASCII/i, 'none'))
|
491
|
-
# )
|
492
|
-
raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
|
493
|
-
raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
|
494
|
-
if defined? eol_char
|
495
|
-
scan(Regexp.new(".*?#{eol_char}|.+",
|
496
|
-
Regexp::MULTILINE,
|
497
|
-
encoding.sub(/ASCII/i, 'none'))
|
498
|
-
)
|
499
|
-
else
|
500
|
-
scan(Regexp.new(".+",
|
501
|
-
Regexp::MULTILINE,
|
502
|
-
encoding.sub(/ASCII/i, 'none'))
|
503
|
-
)
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
def count_graph_line()
|
508
|
-
split_to_line.collect{|line|
|
509
|
-
line if Regexp.new("[#{Encodings[encoding]::GRAPH}" +
|
510
|
-
"#{Encodings[encoding]::JA_GRAPH}]",
|
511
|
-
Regexp::MULTILINE,
|
512
|
-
encoding.sub(/ASCII/, 'none')).match line
|
513
|
-
}.compact.size
|
514
|
-
end
|
515
|
-
|
516
|
-
def count_blank_line()
|
517
|
-
split_to_line.collect{|line|
|
518
|
-
line if Regexp.new("^[#{Encodings[encoding]::BLANK}" +
|
519
|
-
"#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
|
520
|
-
Regexp::MULTILINE,
|
521
|
-
encoding.sub(/ASCII/, 'none')).match line
|
522
|
-
}.compact.size
|
523
|
-
end
|
524
|
-
|
525
|
-
# load encoding modules
|
526
|
-
require 'docdiff/encoding/en_ascii'
|
527
|
-
require 'docdiff/encoding/ja_eucjp'
|
528
|
-
require 'docdiff/encoding/ja_sjis'
|
529
|
-
require 'docdiff/encoding/ja_utf8'
|
530
|
-
end # end ruby_m17n?
|
531
254
|
alias to_bytes split_to_byte
|
532
255
|
alias to_chars split_to_char
|
533
256
|
alias to_words split_to_word
|
@@ -573,6 +296,7 @@ end # end ruby_m17n?
|
|
573
296
|
end
|
574
297
|
|
575
298
|
end # module CharString
|
299
|
+
end # class DocDiff
|
576
300
|
|
577
301
|
# class String
|
578
302
|
# include CharString
|