docdiff 0.5.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f8733587f13d662ca5c5fda60843298ef71ef798284d9f677d03378d8b5e0e29
4
+ data.tar.gz: f92804fdb17576aaded799010553a0d6685dbb8679e53cbf6cf15c4773af8ddf
5
+ SHA512:
6
+ metadata.gz: 94af3213ec734c2b80ad72f70bb1d1312d14da7d15e77237f1824ade5d83f39d666bf8049dcb4c6a2898b402ab80b64c22184bb93359f23a8496f83d0e191f0b
7
+ data.tar.gz: dfd7a6f65ff88a556b5ccb5612daac005b3405cb0f36f64cc5f4914a061bdbd20c4ac4ab5610c657b9036f09485a18e912e527bc5c78b276d2a494bfc977455a
data/.travis.yml CHANGED
@@ -1,7 +1,9 @@
1
1
  rvm:
2
- - 1.8.7
3
- - 1.9.2
4
- - 1.9.3
2
+ - 2.0
3
+ - 2.1
4
+ - 2.2
5
+ - 2.3
6
+ - 2.4
5
7
  - ruby-head
6
8
 
7
9
  script: rake test
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source :rubygems
1
+ source 'https://rubygems.org'
2
2
 
3
3
  group :darwin do
4
4
  gem 'rb-fsevent'
data/Makefile CHANGED
@@ -1,19 +1,21 @@
1
+ # Warning: this Makefile is obsolete, use Rakefile instead
2
+
1
3
  PRODUCT = docdiff
2
- VERSION = 0.4.0
4
+ VERSION = $(shell $(RUBY) -r./lib/docdiff/version.rb -e 'Docdiff::VERSION.display')
3
5
  RUBY = ruby
4
6
  TAR_XVCS = tar --exclude=.svn --exclude=.git
5
7
 
6
8
  DOCS = ChangeLog readme.en.html readme.ja.html \
7
9
  index.en.html index.ja.html
8
10
  DOCSRC = readme.html index.html img sample
9
- TESTS = testcharstring.rb testdiff.rb testdifference.rb \
10
- testdocdiff.rb testdocument.rb testview.rb
11
- DIST = Makefile devutil docdiff docdiff.conf.example docdiff.rb \
11
+ TESTS = test/*_test.rb
12
+ DIST = Makefile devutil lib docdiff.conf.example bin/docdiff \
12
13
  docdiff.gemspec \
13
14
  docdiffwebui.html docdiffwebui.cgi \
14
15
  $(DOCSRC) $(DOCS) $(TESTS)
15
- TESTLOGS = testdocdiff.log testcharstring.log testdocument.log \
16
- testdiff.log testdifference.log testview.log testviewdiff.log
16
+ TESTLOGS = $(foreach t,\
17
+ $(wildcard test/*_test.rb),\
18
+ $(t:test/%_test.rb=%_test.log)) \
17
19
 
18
20
  WWWUSER = hisashim,docdiff
19
21
  WWWSITE = web.sourceforge.net
@@ -28,22 +30,16 @@ all: $(DOCS)
28
30
 
29
31
  testall:
30
32
  $(MAKE) test RUBY=ruby1.9.1
31
- $(MAKE) test RUBY=ruby1.8
32
33
 
33
34
  test: $(TESTLOGS)
34
35
 
35
- test%.log:
36
- $(RUBY) -I. test/test$*.rb | tee $@
36
+ %_test.log:
37
+ $(RUBY) -I./lib test/$*_test.rb | tee $@
37
38
 
38
39
  docs: $(DOCS)
39
40
 
40
41
  ChangeLog:
41
- # For real ChangeLog style, try http://arthurdejong.org/svn2cl/
42
- if [ -d .svn ] ; then \
43
- svn log -rHEAD:0 -v > ChangeLog ; \
44
- else \
45
- git svn log > ChangeLog ; \
46
- fi
42
+ devutil/changelog.sh > $@
47
43
 
48
44
  readme.%.html: readme.html
49
45
  $(RUBY) -Ku langfilter.rb --$* $< > $@
@@ -54,13 +50,13 @@ install: $(DIST)
54
50
  @if [ ! -d $(DESTDIR)$(PREFIX)/bin ]; then \
55
51
  mkdir -p $(DESTDIR)$(PREFIX)/bin; \
56
52
  fi
57
- cp -Ppv docdiff.rb $(DESTDIR)$(PREFIX)/bin/docdiff
53
+ cp -Ppv bin/docdiff $(DESTDIR)$(PREFIX)/bin/
58
54
  chmod +x $(DESTDIR)$(PREFIX)/bin/docdiff
59
55
 
60
- @if [ ! -d $(datadir)$(PRODUCT) ]; then \
61
- mkdir -p $(datadir)$(PRODUCT); \
56
+ @if [ ! -d $(datadir)/$(PRODUCT) ]; then \
57
+ mkdir -p $(datadir)/$(PRODUCT); \
62
58
  fi
63
- ($(TAR_XVCS) -cf - docdiff) | (cd $(datadir)$(PRODUCT) && tar -xpf -)
59
+ (cd lib && $(TAR_XVCS) -cf - *) | (cd $(datadir)/$(PRODUCT) && tar -xpf -)
64
60
 
65
61
  @if [ ! -d $(DESTDIR)/etc/$(PRODUCT) ]; then \
66
62
  mkdir -p $(DESTDIR)/etc/$(PRODUCT); \
data/Rakefile CHANGED
@@ -1,17 +1,52 @@
1
+ require 'rake/clean'
2
+ require 'rake/testtask'
1
3
  require 'bundler/gem_tasks'
2
4
 
3
- require 'rake/testtask'
5
+ RUBY = ENV['RUBY'] ||= 'ruby'
6
+ DOCS = FileList['ChangeLog', 'readme.en.html', 'readme.ja.html',
7
+ 'index.en.html', 'index.ja.html']
8
+ DOCSRC = FileList['readme.html', 'index.html', 'img', 'sample']
9
+ TESTS = FileList['test/*_test.rb']
10
+ TESTLOGS = Dir.glob('test/*_test.rb').map{|f|
11
+ File.basename(f).ext('log')
12
+ }
13
+
14
+ WWWUSER = ENV['WWWUSER'] ||= 'hisashim,docdiff'
15
+ WWWSITE = ENV['WWWSITE'] ||= 'web.sourceforge.net'
16
+ WWWSITEPATH = ENV['WWWSITEPATH'] ||= 'htdocs/'
17
+ WWWDRYRUN = ENV['WWWDRYRUN'] ||= '--dry-run'
18
+
4
19
  Rake::TestTask.new do |t|
5
- t.test_files = FileList["test/test*.rb"]
20
+ t.test_files = TESTS
6
21
  t.verbose = true
7
22
  end
8
23
 
9
- if RUBY_VERSION < '1.9'
10
- require 'rcov/rcovtask'
11
- Rcov::RcovTask.new do |t|
12
- t.test_files = FileList['test/test*.rb']
13
- t.output_dir = 'coverage'
14
- t.rcov_opts = ["--exclude /gems/*"]
15
- t.verbose = true
16
- end
24
+ task :default => :test
25
+
26
+ desc "generate documents"
27
+ task :docs => DOCS
28
+
29
+ file 'ChangeLog' do |t|
30
+ sh "devutil/changelog.sh > #{t.name}"
17
31
  end
32
+
33
+ rule(/.*\.(?:en|ja)\.html/ => proc{|tn| tn.gsub(/\.(?:en|ja)/, '')}) do |t|
34
+ sh "#{RUBY} -E UTF-8 langfilter.rb" +
35
+ " --#{t.name.gsub(/.*?\.(en|ja)\.html/){$1}}" +
36
+ " #{t.prerequisites.first} > #{t.name}"
37
+ end
38
+
39
+ desc "force to rsync web contents"
40
+ task :wwwupload do |t|
41
+ sh "rake www WWWDRYRUN="
42
+ end
43
+
44
+ desc "rsync web contents"
45
+ task :www => DOCSRC + DOCS do |t|
46
+ sh "rsync #{WWWDRYRUN} -auv -e ssh --delete" +
47
+ " --exclude='.svn' --exclude='.git'" +
48
+ t.prerequisites.join(' ') +
49
+ " #{WWWUSER}@#{WWWSITE}:#{WWWSITEPATH}"
50
+ end
51
+
52
+ CLEAN.include(DOCS, TESTLOGS)
data/bin/docdiff CHANGED
@@ -1,11 +1,8 @@
1
1
  #!/usr/bin/env ruby
2
2
  # DocDiff: word/character-oriented text comparison utility
3
3
  # Copyright (C) 2002-2011 Hisashi MORITA
4
- # Requirements: Ruby (>= 1.8)
4
+ # Requirements: Ruby (>= 2.0)
5
5
  require 'docdiff'
6
- require 'docdiff/difference'
7
- require 'docdiff/document'
8
- require 'docdiff/view'
9
6
  require 'optparse'
10
7
 
11
8
  # do_config_stuff
@@ -40,7 +37,7 @@ ARGV.options {|o|
40
37
  o.def_option('--char', 'set resolution to char'){clo[:resolution] = "char"}
41
38
 
42
39
  o.def_option('--encoding=ENCODING',
43
- possible_encodings = ['ASCII','EUC-JP','Shift_JIS','UTF-8','auto'],
40
+ possible_encodings = ['ASCII','EUC-JP','Shift_JIS','CP932','UTF-8','auto'],
44
41
  'specify character encoding',
45
42
  possible_encodings.join('|'), "(default is auto. try ASCII for single byte encodings such as ISO-8859-X)"
46
43
  ){|s| clo[:encoding] = (s || "auto")}
@@ -48,6 +45,7 @@ ARGV.options {|o|
48
45
  o.def_option('--iso8859x', 'same as --encoding=ASCII'){clo[:encoding] = "ASCII"}
49
46
  o.def_option('--eucjp', 'same as --encoding=EUC-JP'){clo[:encoding] = "EUC-JP"}
50
47
  o.def_option('--sjis', 'same as --encoding=Shift_JIS'){clo[:encoding] = "Shift_JIS"}
48
+ o.def_option('--cp932', 'same as --encoding=CP932'){clo[:encoding] = "CP932"}
51
49
  o.def_option('--utf8', 'same as --encoding=UTF-8'){clo[:encoding] = "UTF-8"}
52
50
 
53
51
  o.def_option('--eol=EOL',
@@ -145,8 +143,8 @@ eol1 = docdiff.config[:eol]
145
143
  eol2 = docdiff.config[:eol]
146
144
 
147
145
  if docdiff.config[:encoding] == "auto"
148
- encoding1 = CharString.guess_encoding(file1_content)
149
- encoding2 = CharString.guess_encoding(file2_content)
146
+ encoding1 = DocDiff::CharString.guess_encoding(file1_content)
147
+ encoding2 = DocDiff::CharString.guess_encoding(file2_content)
150
148
  case
151
149
  when (encoding1 == "UNKNOWN" or encoding2 == "UNKNOWN")
152
150
  raise "Document encoding unknown (#{encoding1}, #{encoding2})."
@@ -156,8 +154,8 @@ if docdiff.config[:encoding] == "auto"
156
154
  end
157
155
 
158
156
  if docdiff.config[:eol] == "auto"
159
- eol1 = CharString.guess_eol(file1_content)
160
- eol2 = CharString.guess_eol(file2_content)
157
+ eol1 = DocDiff::CharString.guess_eol(file1_content)
158
+ eol2 = DocDiff::CharString.guess_eol(file2_content)
161
159
  case
162
160
  when (eol1.nil? or eol2.nil?)
163
161
  raise "Document eol is nil (#{eol1.inspect}, #{eol2.inspect}). The document might be empty."
@@ -168,8 +166,8 @@ if docdiff.config[:eol] == "auto"
168
166
  end
169
167
  end
170
168
 
171
- doc1 = Document.new(file1_content, encoding1, eol1)
172
- doc2 = Document.new(file2_content, encoding2, eol2)
169
+ doc1 = DocDiff::Document.new(file1_content, encoding1, eol1)
170
+ doc2 = DocDiff::Document.new(file2_content, encoding2, eol2)
173
171
 
174
172
  output = docdiff.run(doc1, doc2,
175
173
  {:resolution => docdiff.config[:resolution],
data/devutil/Rakefile ADDED
@@ -0,0 +1,9 @@
1
+ require 'rake/clean'
2
+
3
+ file 'JIS0208.TXT' do |t|
4
+ sh 'curl -O ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT'
5
+ end
6
+
7
+ task :default => 'JIS0208.TXT'
8
+
9
+ CLOBBER.include('JIS0208.TXT')
@@ -0,0 +1,40 @@
1
+ #!/bin/sh
2
+ # ChangeLog Generator
3
+ # Copyright 2011 Hisashi Morita
4
+ # License: Public Domain
5
+ #
6
+ # Usage:
7
+ # changelog.sh [WORKING_DIR] > ChangeLog
8
+
9
+ if [ "$1" ]; then
10
+ WD="$1"
11
+ else
12
+ WD="."
13
+ fi
14
+
15
+ # Subversion
16
+ which svn >/dev/null
17
+ if [ x"$?" = x0 ]; then
18
+ (svn info "${WD}" >/dev/null 2>&1) && SVN=TRUE
19
+ if [ x"${SVN}" = xTRUE ]; then
20
+ (cd "${WD}"; svn log -rBASE:0 -v)
21
+ fi
22
+ fi
23
+
24
+ # Git
25
+ which git >/dev/null
26
+ if [ x"$?" = x0 ]; then
27
+ (cd "${WD}" && git status --porcelain >/dev/null 2>&1) && GIT=TRUE
28
+ if [ x"${GIT}" = xTRUE ]; then
29
+ (cd "${WD}"; git log | cat)
30
+ fi
31
+ fi
32
+
33
+ # Mercurial
34
+ which hg >/dev/null
35
+ if [ x"$?" = x0 ]; then
36
+ (hg status "${WD}" >/dev/null 2>&1) && HG=TRUE
37
+ if [ x"${HG}" = xTRUE ]; then
38
+ (cd "${WD}"; hg log --rev tip:0)
39
+ fi
40
+ fi
data/docdiff.gemspec CHANGED
@@ -1,20 +1,20 @@
1
1
  # -*- encoding: utf-8 -*-
2
- $:.push File.expand_path("../lib", __FILE__)
2
+ $:.unshift File.expand_path("../lib", __FILE__)
3
3
  require "docdiff/version"
4
4
 
5
5
  Gem::Specification.new do |s|
6
6
  s.name = "docdiff"
7
7
  s.version = Docdiff::VERSION
8
+ s.license = "BSD-3-Clause"
8
9
  s.authors = ["Hisashi Morita"]
9
- s.email = ["hisashim at users.sourceforge.net"]
10
- s.homepage = "http://docdiff.sourceforge.net"
10
+ s.email = ["hisashim at workbook.org"]
11
+ s.homepage = "https://github.com/hisashim/docdiff"
11
12
  s.summary = %q{Word-by-word diff}
12
13
  s.description = %q{DocDiff compares two text files and shows the
13
14
  difference. It can compare files word by word,
14
15
  character by character, or line by line. It has
15
16
  several output formats such as HTML, tty, Manued,
16
17
  or user-defined markup.}
17
- s.rubyforge_project = "docdiff"
18
18
 
19
19
  s.files = `git ls-files`.split("\n")
20
20
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
data/docdiffwebui.cgi CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/ruby
2
2
  # DocDiff Web UI (CGI)
3
3
  # 2005-10-08.. Hisashi Morita
4
- # requirement: Ruby 1.8+ (for timeout.rb)
4
+ # requirement: Ruby 2.0+ (for timeout.rb)
5
5
 
6
6
  require 'cgi'
7
7
  require 'tempfile'
data/langfilter.rb CHANGED
@@ -2,13 +2,9 @@
2
2
  # language filter
3
3
  # usage: langfilter.rb --en <infile >outfile
4
4
 
5
- def ruby_m17n?
6
- return true if "".respond_to? :encoding
7
- end
8
-
9
5
  lang_to_include = ARGV.shift.gsub(/-+/, "")
10
6
  lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
11
7
  re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m
12
8
 
13
- ARGF.set_encoding("UTF-8") if ruby_m17n?
9
+ ARGF.set_encoding("UTF-8")
14
10
  ARGF.read.gsub(re, "").display
data/lib/doc_diff.rb CHANGED
@@ -1,6 +1,10 @@
1
1
  # DocDiff: word/character-oriented text comparison utility
2
2
  # Copyright (C) 2002-2011 Hisashi MORITA
3
- # Requirements: Ruby (>= 1.8)
3
+ # Requirements: Ruby (>= 2.0)
4
+ require 'docdiff/difference'
5
+ require 'docdiff/document'
6
+ require 'docdiff/view'
7
+
4
8
  class DocDiff
5
9
 
6
10
  AppVersion = Docdiff::VERSION
data/lib/docdiff.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  # DocDiff: word/character-oriented text comparison utility
2
2
  # Copyright (C) 2002-2011 Hisashi MORITA
3
- # Requirements: Ruby (>= 1.8)
3
+ # Requirements: Ruby (>= 2.0)
4
4
  require 'docdiff/version'
5
5
  require 'doc_diff'
6
6
  module Docdiff
@@ -3,6 +3,7 @@
3
3
  # To use, include to String, or extend String.
4
4
  # 2003- Hisashi MORITA
5
5
 
6
+ class DocDiff
6
7
  module CharString
7
8
 
8
9
  Encodings = {}
@@ -72,9 +73,10 @@ module CharString
72
73
  # returns 'CR', 'LF', 'CRLF', 'UNKNOWN'(binary),
73
74
  # 'NONE'(1-line), or nil
74
75
  return nil if string == nil #=> nil (argument missing)
75
- eol_counts = {'CR' => string.scan(/(\r)(?!\n)/o).size,
76
- 'LF' => string.scan(/(?:\A|[^\r])(\n)/o).size,
77
- 'CRLF' => string.scan(/(\r\n)/o).size}
76
+ bin_string = string.dup.force_encoding("ASCII-8BIT")
77
+ eol_counts = {'CR' => bin_string.scan(/(\r)(?!\n)/o).size,
78
+ 'LF' => bin_string.scan(/(?:\A|[^\r])(\n)/o).size,
79
+ 'CRLF' => bin_string.scan(/(\r\n)/o).size}
78
80
  eol_counts.delete_if{|eol, count| count == 0} # Remove missing EOL
79
81
  eols = eol_counts.keys
80
82
  eol_variety = eols.size # numbers of flavors found
@@ -87,10 +89,6 @@ module CharString
87
89
  end
88
90
  end
89
91
 
90
- def CharString.ruby_m17n?
91
- "".respond_to?(:force_encoding)
92
- end
93
-
94
92
  # Note that some languages (like Japanese) do not have 'word' or 'phrase',
95
93
  # thus some of the following methods are not 'linguistically correct'.
96
94
 
@@ -128,7 +126,6 @@ module CharString
128
126
  }.compact.size
129
127
  end
130
128
 
131
- if ruby_m17n?
132
129
  # for Ruby-1.9
133
130
  def encoding()
134
131
  String.new(self).encoding.to_s
@@ -254,280 +251,6 @@ if ruby_m17n?
254
251
  require 'docdiff/encoding/ja_eucjp'
255
252
  require 'docdiff/encoding/ja_sjis'
256
253
  require 'docdiff/encoding/ja_utf8'
257
- else
258
- # for Ruby-1.8
259
- require 'iconv'
260
-
261
- def encoding()
262
- @encoding
263
- # if @encoding
264
- # @encoding
265
- # else
266
- # @encoding = CharString.guess_encoding(self)
267
- # # raise "encoding is not set.\n"
268
- # end
269
- end
270
-
271
- def encoding=(cs)
272
- @encoding = cs
273
- extend Encodings[@encoding] # ; p "Hey, I extended #{Encodings[@encoding]}!"
274
- end
275
-
276
- # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
277
- def CharString.guess_encoding(string)
278
- return nil if string == nil
279
- result_using_pureruby = CharString.guess_encoding_using_pureruby(string)
280
- result_using_iconv = CharString.guess_encoding_using_iconv(string)
281
- if result_using_pureruby == result_using_iconv
282
- result_using_pureruby
283
- else
284
- "UNKNOWN"
285
- end
286
- end
287
-
288
- # returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
289
- def CharString.guess_encoding_using_pureruby(string)
290
- return nil if string == nil
291
-
292
- ascii_pat = '[\x00-\x7f]'
293
- jis_pat = ['(?:(?:\x1b\x28\x42)',
294
- '|(?:\x1b\x28\x4a)',
295
- '|(?:\x1b\x28\x49)',
296
- '|(?:\x1b\x24\x40)',
297
- '|(?:\x1b\x24\x42)',
298
- '|(?:\x1b\x24\x44))'].join
299
- eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
300
- '|(?:[\x20-\x7e])',
301
- '|(?:\x8e[\xa1-\xdf])',
302
- '|(?:[\xa1-\xfe][\xa1-\xfe])',
303
- '|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
304
- sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
305
- '|(?:[\x20-\x7e])',
306
- '|(?:[\xa1-\xdf])',
307
- '|(?:[\x81-\x9f][\x40-\x7e])',
308
- '|(?:[\xe0-\xef][\x80-\xfc]))'].join
309
- utf8_pat = ['(?:(?:[\x00-\x7f])',
310
- '|(?:[\xc0-\xdf][\x80-\xbf])',
311
- '|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
312
- '|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join
313
-
314
- ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
315
- jis_escseq_count = string.scan(/#{jis_pat}/on).size
316
- eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
317
- sjis_match_length = string.scan(/#{sjis_pat}/no).join.length
318
- utf8_match_length = string.scan(/#{utf8_pat}/no).join.length
319
-
320
- case
321
- when 0 < jis_escseq_count # JIS escape sequense found
322
- guessed_encoding = 'JIS'
323
- when ascii_match_length == string.length # every char is ASCII (but not JIS)
324
- guessed_encoding = 'US-ASCII'
325
- else
326
- case
327
- when eucjp_match_length < (string.length / 2) &&
328
- sjis_match_length < (string.length / 2) &&
329
- utf8_match_length < (string.length / 2)
330
- guessed_encoding = 'UNKNOWN' # either encoding did not match long enough
331
- when (eucjp_match_length < utf8_match_length) &&
332
- (sjis_match_length < utf8_match_length)
333
- guessed_encoding = 'UTF-8'
334
- when (eucjp_match_length < sjis_match_length) &&
335
- (utf8_match_length < sjis_match_length)
336
- guessed_encoding = 'Shift_JIS'
337
- when (sjis_match_length < eucjp_match_length) &&
338
- (utf8_match_length < eucjp_match_length)
339
- guessed_encoding = 'EUC-JP'
340
- else
341
- guessed_encoding = 'UNKNOWN' # cannot guess at all
342
- end
343
- end
344
- return guessed_encoding
345
- end
346
-
347
- def CharString.guess_encoding_using_iconv(string)
348
- valid_as_utf8 = CharString.valid_as("utf-8", string)
349
- valid_as_sjis = CharString.valid_as("cp932", string) # not sjis, but cp932
350
- valid_as_jis = CharString.valid_as("iso-2022-jp", string)
351
- valid_as_eucjp = CharString.valid_as("eucjp", string)
352
- valid_as_ascii = CharString.valid_as("ascii", string)
353
- invalid_as_utf8 = CharString.invalid_as("utf-8", string)
354
- invalid_as_sjis = CharString.invalid_as("cp932", string) # not sjis, but cp932
355
- invalid_as_jis = CharString.invalid_as("iso-2022-jp", string)
356
- invalid_as_eucjp = CharString.invalid_as("eucjp", string)
357
- invalid_as_ascii = CharString.invalid_as("ascii", string)
358
- case
359
- when string == nil
360
- nil
361
- when valid_as_ascii
362
- "US-ASCII"
363
- when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
364
- "JIS"
365
- when valid_as_eucjp
366
- "EUC-JP"
367
- when valid_as_sjis && invalid_as_utf8 && invalid_as_eucjp && invalid_as_jis
368
- "Shift_JIS"
369
- when valid_as_utf8 && invalid_as_sjis && invalid_as_eucjp && invalid_as_jis
370
- "UTF-8"
371
- else
372
- "UNKNOWN"
373
- end
374
- end
375
-
376
- def CharString.valid_as(encoding_name, string)
377
- begin
378
- Iconv.iconv(encoding_name, encoding_name, string)
379
- rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
380
- return false
381
- else
382
- return true
383
- end
384
- end
385
-
386
- def CharString.invalid_as(encoding_name, string)
387
- if CharString.valid_as(encoding_name, string)
388
- false
389
- else
390
- true
391
- end
392
- end
393
-
394
- def split_to_byte()
395
- scan(/./nm)
396
- end
397
-
398
- def split_to_char()
399
- raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
400
- # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
401
- if eol_char # sometimes string has no end-of-line char
402
- scan(Regexp.new("(?:#{eol_char})|(?:.)",
403
- Regexp::MULTILINE,
404
- encoding.sub(/ASCII/i, 'none'))
405
- )
406
- else # it seems that no EOL module was extended...
407
- scan(Regexp.new("(?:.)",
408
- Regexp::MULTILINE,
409
- encoding.sub(/ASCII/i, 'none'))
410
- )
411
- end
412
- end
413
-
414
- def count_latin_graph_char()
415
- raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
416
- # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
417
- scan(Regexp.new("[#{Encodings[encoding]::GRAPH}]",
418
- Regexp::MULTILINE,
419
- encoding.sub(/ASCII/i, 'none'))
420
- ).size
421
- end
422
-
423
- def count_ja_graph_char()
424
- raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
425
- # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
426
- scan(Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
427
- Regexp::MULTILINE,
428
- encoding.sub(/ASCII/i, 'none'))
429
- ).size
430
- end
431
-
432
- def count_latin_blank_char()
433
- scan(Regexp.new("[#{Encodings[encoding]::BLANK}]",
434
- Regexp::MULTILINE,
435
- encoding.sub(/ASCII/i, 'none'))
436
- ).size
437
- end
438
-
439
- def count_ja_blank_char()
440
- scan(Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
441
- Regexp::MULTILINE,
442
- encoding.sub(/ASCII/i, 'none'))
443
- ).size
444
- end
445
-
446
- def split_to_word()
447
- raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
448
- # raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
449
- scan(Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
450
- Regexp::MULTILINE,
451
- encoding.sub(/ASCII/i, 'none'))
452
- )
453
- end
454
-
455
- def count_latin_word()
456
- split_to_word.collect{|word|
457
- word if Regexp.new("[#{Encodings[encoding]::PRINT}]",
458
- Regexp::MULTILINE,
459
- encoding.sub(/ASCII/i, 'none')).match word
460
- }.compact.size
461
- end
462
-
463
- def count_ja_word()
464
- split_to_word.collect{|word|
465
- word if Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
466
- Regexp::MULTILINE,
467
- encoding.sub(/ASCII/i, 'none')).match word
468
- }.compact.size
469
- end
470
-
471
- def count_latin_valid_word()
472
- split_to_word.collect{|word|
473
- word if Regexp.new("[#{Encodings[encoding]::ALNUM}]",
474
- Regexp::MULTILINE,
475
- encoding.sub(/ASCII/i, 'none')).match word
476
- }.compact.size
477
- end
478
-
479
- def count_ja_valid_word()
480
- split_to_word.collect{|word|
481
- word if Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
482
- Regexp::MULTILINE,
483
- encoding.sub(/ASCII/i, 'none')).match word
484
- }.compact.size
485
- end
486
-
487
- def split_to_line()
488
- # scan(Regexp.new(".*?#{eol_char}|.+",
489
- # Regexp::MULTILINE,
490
- # encoding.sub(/ASCII/i, 'none'))
491
- # )
492
- raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
493
- raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
494
- if defined? eol_char
495
- scan(Regexp.new(".*?#{eol_char}|.+",
496
- Regexp::MULTILINE,
497
- encoding.sub(/ASCII/i, 'none'))
498
- )
499
- else
500
- scan(Regexp.new(".+",
501
- Regexp::MULTILINE,
502
- encoding.sub(/ASCII/i, 'none'))
503
- )
504
- end
505
- end
506
-
507
- def count_graph_line()
508
- split_to_line.collect{|line|
509
- line if Regexp.new("[#{Encodings[encoding]::GRAPH}" +
510
- "#{Encodings[encoding]::JA_GRAPH}]",
511
- Regexp::MULTILINE,
512
- encoding.sub(/ASCII/, 'none')).match line
513
- }.compact.size
514
- end
515
-
516
- def count_blank_line()
517
- split_to_line.collect{|line|
518
- line if Regexp.new("^[#{Encodings[encoding]::BLANK}" +
519
- "#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
520
- Regexp::MULTILINE,
521
- encoding.sub(/ASCII/, 'none')).match line
522
- }.compact.size
523
- end
524
-
525
- # load encoding modules
526
- require 'docdiff/encoding/en_ascii'
527
- require 'docdiff/encoding/ja_eucjp'
528
- require 'docdiff/encoding/ja_sjis'
529
- require 'docdiff/encoding/ja_utf8'
530
- end # end ruby_m17n?
531
254
  alias to_bytes split_to_byte
532
255
  alias to_chars split_to_char
533
256
  alias to_words split_to_word
@@ -573,6 +296,7 @@ end # end ruby_m17n?
573
296
  end
574
297
 
575
298
  end # module CharString
299
+ end # class DocDiff
576
300
 
577
301
  # class String
578
302
  # include CharString