docdiff 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
data/langfilter.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/ruby
# language filter
# usage: langfilter.rb --en <infile >outfile
#
# Reads an HTML document from ARGF and prints it with every element removed
# whose lang="..." or title="..." attribute names the language that was NOT
# requested ("ja" elements for --en, "en" elements for --ja).

# True when this Ruby attaches an encoding to each String (String#encoding).
def ruby_m17n?
  "".respond_to?(:encoding)
end

# to_s: a missing argument would otherwise fail with NoMethodError on nil.
lang_to_include = ARGV.shift.to_s.gsub(/-+/, "")
lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
# An unknown language would interpolate "" into the pattern below and
# silently strip elements carrying lang="" instead of reporting the mistake.
abort "usage: langfilter.rb --en|--ja <infile >outfile" unless lang_to_exclude
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m

ARGF.set_encoding("UTF-8") if ruby_m17n?
ARGF.read.gsub(re, "").display
|
data/lib/doc_diff.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
# DocDiff: word/character-oriented text comparison utility
|
2
|
+
# Copyright (C) 2002-2011 Hisashi MORITA
|
3
|
+
# Requirements: Ruby (>= 1.8)
|
4
|
+
class DocDiff
|
5
|
+
|
6
|
+
AppVersion = Docdiff::VERSION
Author = "Copyright (C) 2002-2011 Hisashi MORITA.\n" +
         "diff library originates from Ruby/CVS by TANAKA Akira.\n"
License = "This software is licensed under so-called modified BSD license.\n" +
          "See the document for detail.\n"
SystemConfigFileName = File.join(File::Separator, "etc", "docdiff", "docdiff.conf")
# ENV['HOME'] may be unset; File.join(nil, ...) would then raise TypeError
# while this class body is being loaded. Fall back to USERPROFILE and finally
# to the filesystem root, which adds no new writable location to the search.
UserConfigFileName = File.join(ENV['HOME'] || ENV['USERPROFILE'] || File::Separator,
                               "etc", "docdiff", "docdiff.conf")
AltUserConfigFileName = File.join(ENV['HOME'] || ENV['USERPROFILE'] || File::Separator,
                                  ".docdiff", "docdiff.conf")
|
14
|
+
|
15
|
+
# Configuration hash; #run reads the :tag_* entries from it and
# #process_config_file merges parsed config files into it.
attr_accessor :config

# Creates an instance with an empty configuration.
def initialize
  @config = Hash.new
end
|
19
|
+
|
20
|
+
# Parses the text of a config file into a Hash of Symbol => value.
#
# Each non-blank line must look like "name = value"; everything from "#" to
# the end of a line is dropped as a comment. Values "on"/"yes"/"true" and
# "off"/"no"/"false" (any case) become true/false, all-digit values become
# Integers, anything else stays a String.
#
# content:: whole file content as a String
# Returns the Hash (empty for empty content).
# Raises RuntimeError for a line without " = " or with whitespace in its name.
def self.parse_config_file_content(content)
  result = {}
  return result if content.size <= 0
  content.split(/\r\n|\r|\n/).each do |raw_line|
    line = raw_line.sub(/#.*$/, '').strip
    next if line == ""
    raise 'line does not include " = ".' unless /\s+=\s+/.match(line)
    # Limit of 2: only the first " = " separates name from value, so a value
    # that itself contains " = " is kept whole instead of being truncated.
    name_src, value_src = line.split(/\s+=\s+/, 2)
    raise "Invalid name: #{name_src.inspect}" if /\s/.match(name_src)
    raise "Invalid value: #{value_src.inspect}" unless value_src.kind_of?(String)
    value = case value_src.downcase
            when 'on', 'yes', 'true' then true
            when 'off', 'no', 'false' then false
            else
              /\A[0-9]+\z/.match(value_src) ? value_src.to_i : value_src
            end
    result[name_src.intern] = value
  end
  result
end
|
41
|
+
|
42
|
+
# Compares two documents at line resolution.
# Returns a Difference built from the line lists of doc1 and doc2.
def compare_by_line(doc1, doc2)
  lines_before = doc1.split_to_line
  lines_after = doc2.split_to_line
  Difference.new(lines_before, lines_after)
end
|
45
|
+
|
46
|
+
# Compares two documents at word resolution.
#
# Runs the line comparison first; each :change_elt block is then re-compared
# word by word and its word-level blocks are appended in place of the line
# block. All other blocks are passed through untouched.
def compare_by_line_word(doc1, doc2)
  refined = Difference.new
  compare_by_line(doc1, doc2).each do |line_block|
    unless line_block.first == :change_elt
      refined << line_block # :common_elt_elt, :del_elt, or :add_elt
      next
    end
    old_text = Document.new(line_block[1].join, doc1.encoding, doc1.eol)
    new_text = Document.new(line_block[2].join, doc2.encoding, doc2.eol)
    word_diff = Difference.new(old_text.split_to_word, new_text.split_to_word)
    word_diff.each { |word_block| refined << word_block }
  end
  refined
end
|
65
|
+
|
66
|
+
# Compares two documents at character resolution.
#
# Builds on compare_by_line_word (lines refined to words) rather than
# repeating its body: every :change_elt block that survives word refinement
# is re-compared character by character; all other blocks pass through.
def compare_by_line_word_char(doc1, doc2)
  lines_words_and_chars = Difference.new
  compare_by_line_word(doc1, doc2).each do |line_or_word|
    if line_or_word.first == :change_elt
      before_change = Document.new(line_or_word[1].join, doc1.encoding, doc1.eol)
      after_change = Document.new(line_or_word[2].join, doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_char,
                     after_change.split_to_char).each do |char|
        lines_words_and_chars << char
      end
    else # :common_elt_elt, :del_elt, or :add_elt
      lines_words_and_chars << line_or_word
    end
  end
  lines_words_and_chars
end
|
98
|
+
|
99
|
+
# Compares doc1 with doc2 and returns the rendered result as one String.
#
# option[:resolution]:: "line", "word" or "char"
# option[:format]::     "tty", "html", "manued", "wdiff", "stat" or "user"
# option[:digest]::     truthy selects the *_digest renderer; false or a
#                       missing key selects the full renderer
#
# Raises RuntimeError when option, :resolution or :format is nil, when the
# document sanity check fails, or when resolution/format is unsupported.
def run(doc1, doc2, option)
  raise "option is nil" if option.nil?
  raise "option[:resolution] is nil" if option[:resolution].nil?
  raise "option[:format] is nil" if option[:format].nil?
  # NOTE(review): as in the original case/when chain, ANY one of these three
  # conditions is enough to pass; the wording suggests all three may have
  # been intended -- confirm before tightening.
  unless (doc1.class == Document && doc2.class == Document) ||
         (doc1.encoding != nil && doc2.encoding != nil) ||
         (doc1.encoding == doc2.encoding && doc1.eol == doc2.eol)
    raise("Error! Blame the author (doc1: #{doc1.encoding}, #{doc1.eol}, doc2: #{doc2.encoding}, #{doc2.eol}).")
  end

  difference = case option[:resolution]
               when "line" then compare_by_line(doc1, doc2)
               when "word" then compare_by_line_word(doc1, doc2)
               when "char" then compare_by_line_word_char(doc1, doc2)
               else
                 raise "Unsupported resolution: #{option[:resolution].inspect}"
               end
  view = View.new(difference, doc1.encoding, doc1.eol)
  # ||= also stores the empty default back into @config, as before.
  user_tags = {:start_common        => (@config[:tag_common_start] ||= ''),
               :end_common          => (@config[:tag_common_end] ||= ''),
               :start_del           => (@config[:tag_del_start] ||= ''),
               :end_del             => (@config[:tag_del_end] ||= ''),
               :start_add           => (@config[:tag_add_start] ||= ''),
               :end_add             => (@config[:tag_add_end] ||= ''),
               :start_before_change => (@config[:tag_change_before_start] ||= ''),
               :end_before_change   => (@config[:tag_change_before_end] ||= ''),
               :start_after_change  => (@config[:tag_change_after_start] ||= ''),
               :end_after_change    => (@config[:tag_change_after_end] ||= '')}
  # A missing :digest key used to leave the result nil and fail later with
  # NoMethodError on nil.join; it now means "no digest".
  digest = option[:digest] ? true : false
  result = case option[:format]
           when "tty"    then digest ? view.to_tty_digest(option) : view.to_tty(option)
           when "html"   then digest ? view.to_html_digest(option) : view.to_html(option)
           when "manued" then digest ? view.to_manued_digest(option) : view.to_manued(option)
           when "wdiff"  then digest ? view.to_wdiff_digest(option) : view.to_wdiff(option)
           when "stat"   then view.to_stat(option)
           when "user"   then digest ? view.to_user_digest(user_tags) : view.to_user(user_tags)
           else
             raise "Unsupported output format: #{option[:format].inspect}."
           end
  result.join
end
|
155
|
+
|
156
|
+
# Reads the config file +filename+ and merges its settings into #config.
#
# Returns nil when the file was read, or an explanatory message String when
# the file does not exist. Other I/O errors and parse errors propagate.
def process_config_file(filename)
  begin
    content = File.open(filename, "r") { |file| file.read }
  rescue Errno::ENOENT
    return "config file not found so not read."
  end
  config.update(DocDiff.parse_config_file_content(content)) unless content.nil?
  nil
end
|
169
|
+
|
170
|
+
end # class DocDiff
|
data/lib/docdiff/charstring.rb
ADDED
@@ -0,0 +1,579 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# Character String module.
|
3
|
+
# To use, include to String, or extend String.
|
4
|
+
# 2003- Hisashi MORITA
|
5
|
+
|
6
|
+
module CharString
|
7
|
+
|
8
|
+
Encodings = {}
|
9
|
+
EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
|
10
|
+
|
11
|
+
# Passes +string+ on to the next initializer in the ancestor chain.
# Encoding and EOL are deliberately not guessed here (the guessing code was
# disabled as unnecessary).
def initialize(string)
  super(string)
end
|
18
|
+
|
19
|
+
# Name of the end-of-line convention assigned through #eol=, or nil.
def eol
  @eol
end

# Stores the EOL name +e+ and mixes in the module registered for it in
# EOLChars, which supplies the matching #eol_char.
def eol=(e)
  @eol = e
  extend(EOLChars[@eol])
end

# Default #eol_char used while no EOL module has been mixed in:
# returns @eol_char when it is set, otherwise nil.
def eol_char
  @eol_char || nil
end
|
43
|
+
|
44
|
+
# Returns a one-line description of this string (object id, class, content,
# and the encoding/EOL modules registered for it).
#
# Raises RuntimeError when the encoding or EOL is unset or not registered.
def debug
  # Use the #encoding reader: @encoding is only assigned by the Ruby 1.8
  # implementation of #encoding=, so reading the ivar directly always
  # raised on m17n Rubies.
  enc = encoding
  case
  when enc == nil
    raise "@encoding is nil."
  when Encodings[enc] == nil
    raise "Encodings[@encoding(=#{enc})] is nil."
  when Encodings[enc].class != Module
    raise "Encodings[@encoding].class(=#{Encodings[enc].class}) is not a module."
  when @eol == nil
    raise "@eol is nil."
  when EOLChars[@eol] == nil
    raise "EOLChars[@eol(=#{@eol})] is nil."
  end
  # Object#id no longer exists after Ruby 1.8; object_id works everywhere.
  ["id: #{object_id}, class: #{self.class}, self: #{self}, ",
   "module: #{Encodings[enc]}, #{EOLChars[@eol]}"].join
end
|
62
|
+
|
63
|
+
# Registers +mod+ in Encodings under the name held in its Encoding constant.
def self.register_encoding(mod)
  encoding_name = mod::Encoding
  Encodings[encoding_name] = mod
end

# Registers +mod+ in EOLChars under the name held in its EOL constant.
def self.register_eol(mod)
  eol_name = mod::EOL
  EOLChars[eol_name] = mod
end
|
70
|
+
|
71
|
+
# Guesses which end-of-line convention +string+ uses.
#
# Returns 'CR', 'LF' or 'CRLF' when exactly one kind occurs, 'NONE' when no
# EOL occurs (might be a 1-line file), 'UNKNOWN' when several kinds are
# mixed (might be binary data), and nil when +string+ is nil.
def self.guess_eol(string)
  return nil if string.nil?
  detectors = [['CR',   /(\r)(?!\n)/o],
               ['LF',   /(?:\A|[^\r])(\n)/o],
               ['CRLF', /(\r\n)/o]]
  # Keep only the flavors that occur at least once.
  found = detectors.reject { |_name, pattern| string.scan(pattern).size == 0 }
  case found.size
  when 1 then found.first.first
  when 0 then 'NONE'
  else 'UNKNOWN'
  end
end
|
89
|
+
|
90
|
+
# True when Strings carry their own encoding (String#force_encoding exists);
# decides which of the two implementations below gets defined.
def self.ruby_m17n?
  String.new.respond_to?(:force_encoding)
end
|
93
|
+
|
94
|
+
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
95
|
+
# thus some of the following methods are not 'linguistically correct'.
|
96
|
+
|
97
|
+
# Number of elements returned by #split_to_byte.
def count_byte
  split_to_byte.length
end

# Number of elements returned by #split_to_char (an EOL counts as 1 char).
def count_char
  split_to_char.length
end

# Latin plus Japanese graphic characters.
def count_graph_char
  latin = count_latin_graph_char
  japanese = count_ja_graph_char
  latin + japanese
end

# Latin plus Japanese blank characters.
def count_blank_char
  latin = count_latin_blank_char
  japanese = count_ja_blank_char
  latin + japanese
end

# Number of elements returned by #split_to_word.
def count_word
  split_to_word.length
end

# Latin plus Japanese valid words.
def count_valid_word
  latin = count_latin_valid_word
  japanese = count_ja_valid_word
  latin + japanese
end

# Number of elements returned by #split_to_line (common to all encodings).
def count_line
  split_to_line.length
end
|
124
|
+
|
125
|
+
# Counts the lines from #split_to_line that match the empty-line pattern
# (a line start followed by the EOL sequence, or an empty line).
def count_empty_line
  empty_lines = split_to_line.select do |line|
    /^(?:#{eol_char})|^$/m.match(line)
  end
  empty_lines.size
end
|
130
|
+
|
131
|
+
if ruby_m17n?
|
132
|
+
# for Ruby-1.9
|
133
|
+
# Name of this string's encoding, as a String (e.g. "UTF-8").
def encoding
  plain_copy = String.new(self)
  plain_copy.encoding.to_s
end

# Relabels this string as encoded in +cs+; the bytes are not converted.
def encoding=(cs)
  force_encoding(cs)
end

# Reports the encoding name Ruby already attached to +string+, or nil when
# +string+ is nil/false.
def self.guess_encoding(string)
  string ? string.encoding.to_s : nil
end
|
148
|
+
|
149
|
+
# Splits the string into an Array of one-byte binary Strings.
#
# The bytes are relabeled as ASCII-8BIT on a copy, not transcoded:
# encode("ASCII-8BIT") raises Encoding::UndefinedConversionError as soon as
# the text contains a non-ASCII character.
def split_to_byte
  String.new(self).force_encoding('ASCII-8BIT').scan(/./nm)
end
|
152
|
+
|
153
|
+
# Splits the string into characters, keeping each EOL sequence together as
# one element. Matching is done on a UTF-8 copy; every element is converted
# back to this string's encoding.
def split_to_char
  # eol_char is nil when no EOL module was extended; then every character,
  # including CR and LF, is its own element.
  char_source = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  chars = encode('UTF-8').scan(Regexp.new(char_source, Regexp::MULTILINE))
  chars.map { |char| char.encode(self.encoding) }
end
|
164
|
+
|
165
|
+
# Counts characters in the GRAPH class of the registered UTF-8 module.
def count_latin_graph_char
  latin_graph = Regexp.new("[#{Encodings['UTF-8']::GRAPH}]", Regexp::MULTILINE)
  encode('UTF-8').scan(latin_graph).size
end

# Counts characters in the JA_GRAPH class of the registered UTF-8 module.
def count_ja_graph_char
  ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  encode('UTF-8').scan(ja_graph).size
end

# Counts characters in the BLANK class of the registered UTF-8 module.
def count_latin_blank_char
  latin_blank = Regexp.new("[#{Encodings['UTF-8']::BLANK}]", Regexp::MULTILINE)
  encode('UTF-8').scan(latin_blank).size
end

# Counts characters in the JA_BLANK class of the registered UTF-8 module.
def count_ja_blank_char
  ja_blank = Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]", Regexp::MULTILINE)
  encode('UTF-8').scan(ja_blank).size
end
|
188
|
+
|
189
|
+
# Splits the string into words using WORD_REGEXP_SRC of the registered
# UTF-8 module. Matching is done on a UTF-8 copy; each word is converted
# back to this string's encoding.
def split_to_word
  word_pattern = Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC, Regexp::MULTILINE)
  words = encode('UTF-8').scan(word_pattern)
  words.map { |word| word.encode(self.encoding) }
end
|
194
|
+
|
195
|
+
# Counts words containing at least one character of the PRINT class.
# The pattern is built once, not once per word.
def count_latin_word
  latin_print = Regexp.new("[#{Encodings['UTF-8']::PRINT}]", Regexp::MULTILINE)
  split_to_word.select { |word| latin_print.match(word.encode('UTF-8')) }.size
end

# Counts words containing at least one character of the JA_PRINT class.
def count_ja_word
  ja_print = Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]", Regexp::MULTILINE)
  split_to_word.select { |word| ja_print.match(word.encode('UTF-8')) }.size
end

# Counts words containing at least one character of the ALNUM class.
def count_latin_valid_word
  latin_alnum = Regexp.new("[#{Encodings['UTF-8']::ALNUM}]", Regexp::MULTILINE)
  split_to_word.select { |word| latin_alnum.match(word.encode('UTF-8')) }.size
end

# Counts words containing at least one character of the JA_GRAPH class.
def count_ja_valid_word
  ja_graph = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  split_to_word.select { |word| ja_graph.match(word.encode('UTF-8')) }.size
end
|
222
|
+
|
223
|
+
# Splits the string into lines, each keeping its trailing EOL sequence; a
# final line without EOL is returned as-is. Matching is done on a UTF-8
# copy and each line is converted back to this string's encoding.
#
# Raises RuntimeError when no module is registered in EOLChars for #eol.
def split_to_line
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  # Test the VALUE of eol_char: `defined? eol_char` is always true because
  # the method exists, and with a nil eol_char (NoEOL) the pattern would
  # degrade to ".*?|.+", whose first alternative only matches empty strings.
  line_source = eol_char ? ".*?#{eol_char}|.+" : ".+"
  lines = encode('UTF-8').scan(Regexp.new(line_source, Regexp::MULTILINE))
  lines.map { |line| line.encode(self.encoding) }
end
|
235
|
+
|
236
|
+
# Counts lines containing at least one GRAPH or JA_GRAPH character.
# The pattern is built once, not once per line.
def count_graph_line
  graph = Regexp.new("[#{Encodings['UTF-8']::GRAPH}" +
                     "#{Encodings['UTF-8']::JA_GRAPH}]",
                     Regexp::MULTILINE)
  split_to_line.select { |line| graph.match(line.encode('UTF-8')) }.size
end

# Counts lines matching "line start, one or more BLANK/JA_BLANK characters,
# optional EOL sequence".
def count_blank_line
  blank = Regexp.new("^[#{Encodings['UTF-8']::BLANK}" +
                     "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?",
                     Regexp::MULTILINE)
  split_to_line.select { |line| blank.match(line.encode('UTF-8')) }.size
end
|
251
|
+
|
252
|
+
# load encoding modules
|
253
|
+
require 'docdiff/encoding/en_ascii'
|
254
|
+
require 'docdiff/encoding/ja_eucjp'
|
255
|
+
require 'docdiff/encoding/ja_sjis'
|
256
|
+
require 'docdiff/encoding/ja_utf8'
|
257
|
+
else
|
258
|
+
# for Ruby-1.8
|
259
|
+
require 'iconv'
|
260
|
+
|
261
|
+
# Encoding name assigned through #encoding=, or nil when none was set.
def encoding
  @encoding
end

# Stores the encoding name +cs+ and mixes in the module registered for it
# in Encodings.
def encoding=(cs)
  @encoding = cs
  extend(Encodings[@encoding])
end
|
275
|
+
|
276
|
+
# Guesses the encoding of +string+ with both guessers and trusts the answer
# only when they agree.
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
def self.guess_encoding(string)
  return nil if string.nil?
  by_pattern = CharString.guess_encoding_using_pureruby(string)
  by_iconv = CharString.guess_encoding_using_iconv(string)
  by_pattern == by_iconv ? by_pattern : "UNKNOWN"
end
|
287
|
+
|
288
|
+
# Guesses the encoding of +string+ from byte patterns alone: for each
# candidate encoding it measures how much of the string is matched by that
# encoding's byte-sequence pattern and picks the strictly longest.
# returns nil, 'US-ASCII', 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8', or 'UNKNOWN'
def self.guess_encoding_using_pureruby(string)
  return nil if string.nil?

  # Wraps a list of alternatives as "(?:alt1|alt2|...)".
  alternation = lambda { |alternatives| '(?:' + alternatives.join('|') + ')' }

  ascii_pat = '[\x00-\x7f]'
  jis_pat = alternation.call(['(?:\x1b\x28\x42)',
                              '(?:\x1b\x28\x4a)',
                              '(?:\x1b\x28\x49)',
                              '(?:\x1b\x24\x40)',
                              '(?:\x1b\x24\x42)',
                              '(?:\x1b\x24\x44)'])
  eucjp_pat = alternation.call(['(?:[\x00-\x1f\x7f])',
                                '(?:[\x20-\x7e])',
                                '(?:\x8e[\xa1-\xdf])',
                                '(?:[\xa1-\xfe][\xa1-\xfe])',
                                '(?:\x8f[\xa1-\xfe][\xa1-\xfe])'])
  sjis_pat = alternation.call(['(?:[\x00-\x1f\x7f])',
                               '(?:[\x20-\x7e])',
                               '(?:[\xa1-\xdf])',
                               '(?:[\x81-\x9f][\x40-\x7e])',
                               '(?:[\xe0-\xef][\x80-\xfc])'])
  utf8_pat = alternation.call(['(?:[\x00-\x7f])',
                               '(?:[\xc0-\xdf][\x80-\xbf])',
                               '(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
                               '(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])'])

  total_length       = string.length
  ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
  jis_escseq_count   = string.scan(/#{jis_pat}/on).size
  eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
  sjis_match_length  = string.scan(/#{sjis_pat}/no).join.length
  utf8_match_length  = string.scan(/#{utf8_pat}/no).join.length

  # JIS escape sequence found
  return 'JIS' if 0 < jis_escseq_count
  # every char is ASCII (but not JIS)
  return 'US-ASCII' if ascii_match_length == total_length

  half_length = total_length / 2
  if eucjp_match_length < half_length &&
     sjis_match_length < half_length &&
     utf8_match_length < half_length
    'UNKNOWN' # no encoding matched long enough
  elsif eucjp_match_length < utf8_match_length &&
        sjis_match_length < utf8_match_length
    'UTF-8'
  elsif eucjp_match_length < sjis_match_length &&
        utf8_match_length < sjis_match_length
    'Shift_JIS'
  elsif sjis_match_length < eucjp_match_length &&
        utf8_match_length < eucjp_match_length
    'EUC-JP'
  else
    'UNKNOWN' # cannot guess at all
  end
end
|
346
|
+
|
347
|
+
# Guesses the encoding of +string+ by asking Iconv (through
# CharString.valid_as) which candidate encodings accept it.
#
# Returns nil for a nil string, otherwise 'US-ASCII', 'JIS', 'EUC-JP',
# 'Shift_JIS', 'UTF-8' or 'UNKNOWN'.
def self.guess_encoding_using_iconv(string)
  # Guard first: the nil check used to sit after the string had already
  # been handed to Iconv.
  return nil if string.nil?

  # One conversion per candidate; the previous code ran every conversion a
  # second time through invalid_as just to obtain the negation.
  valid_as_utf8  = CharString.valid_as("utf-8", string)
  valid_as_sjis  = CharString.valid_as("cp932", string) # not sjis, but cp932
  valid_as_jis   = CharString.valid_as("iso-2022-jp", string)
  valid_as_eucjp = CharString.valid_as("eucjp", string)
  valid_as_ascii = CharString.valid_as("ascii", string)

  if valid_as_ascii
    "US-ASCII"
  elsif valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
    "JIS"
  elsif valid_as_eucjp
    "EUC-JP"
  elsif valid_as_sjis && !valid_as_utf8 && !valid_as_eucjp && !valid_as_jis
    "Shift_JIS"
  elsif valid_as_utf8 && !valid_as_sjis && !valid_as_eucjp && !valid_as_jis
    "UTF-8"
  else
    "UNKNOWN"
  end
end
|
375
|
+
|
376
|
+
# True when Iconv can convert +string+ from +encoding_name+ to itself, i.e.
# without raising IllegalSequence, InvalidCharacter or OutOfRange.
def self.valid_as(encoding_name, string)
  Iconv.iconv(encoding_name, encoding_name, string)
  true
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
  false
end

# Logical negation of valid_as.
def self.invalid_as(encoding_name, string)
  !valid_as(encoding_name, string)
end
|
393
|
+
|
394
|
+
# Splits the string with the encoding-less ("n") any-character pattern,
# returning the matches as an Array of Strings.
def split_to_byte
  any_single_byte = /./nm
  scan(any_single_byte)
end
|
397
|
+
|
398
|
+
# Splits the string into characters, keeping each EOL sequence together as
# one element. The pattern is compiled with this string's encoding name
# ("ASCII" replaced by "none") as Regexp.new's third argument.
#
# Raises RuntimeError when no module is registered for #encoding.
def split_to_char
  unless Encodings[encoding]
    raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed."
  end
  # eol_char is nil when no EOL module was extended.
  char_source = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  scan(Regexp.new(char_source, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none')))
end
|
413
|
+
|
414
|
+
# Counts characters in the GRAPH class of this string's encoding module.
# Raises RuntimeError when no module is registered for #encoding.
def count_latin_graph_char
  unless Encodings[encoding]
    raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed."
  end
  char_class = "[#{Encodings[encoding]::GRAPH}]"
  scan(Regexp.new(char_class, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none'))).size
end

# Counts characters in the JA_GRAPH class of this string's encoding module.
# Raises RuntimeError when no module is registered for #encoding.
def count_ja_graph_char
  unless Encodings[encoding]
    raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed."
  end
  char_class = "[#{Encodings[encoding]::JA_GRAPH}]"
  scan(Regexp.new(char_class, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none'))).size
end

# Counts characters in the BLANK class of this string's encoding module.
def count_latin_blank_char
  char_class = "[#{Encodings[encoding]::BLANK}]"
  scan(Regexp.new(char_class, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none'))).size
end

# Counts characters in the JA_BLANK class of this string's encoding module.
def count_ja_blank_char
  char_class = "[#{Encodings[encoding]::JA_BLANK}]"
  scan(Regexp.new(char_class, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none'))).size
end
|
445
|
+
|
446
|
+
# Splits the string into words using WORD_REGEXP_SRC of this string's
# encoding module.
#
# Raises RuntimeError when no module is registered for #encoding.
def split_to_word
  unless Encodings[encoding]
    raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed."
  end
  word_source = Encodings[encoding]::WORD_REGEXP_SRC
  scan(Regexp.new(word_source, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none')))
end
|
454
|
+
|
455
|
+
# Counts words containing at least one character of the PRINT class.
# The pattern is built once, not once per word.
def count_latin_word
  latin_print = Regexp.new("[#{Encodings[encoding]::PRINT}]",
                           Regexp::MULTILINE,
                           encoding.sub(/ASCII/i, 'none'))
  split_to_word.select { |word| latin_print.match(word) }.size
end

# Counts words containing at least one character of the JA_PRINT class.
def count_ja_word
  ja_print = Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
                        Regexp::MULTILINE,
                        encoding.sub(/ASCII/i, 'none'))
  split_to_word.select { |word| ja_print.match(word) }.size
end

# Counts words containing at least one character of the ALNUM class.
def count_latin_valid_word
  latin_alnum = Regexp.new("[#{Encodings[encoding]::ALNUM}]",
                           Regexp::MULTILINE,
                           encoding.sub(/ASCII/i, 'none'))
  split_to_word.select { |word| latin_alnum.match(word) }.size
end

# Counts words containing at least one character of the JA_GRAPH class.
def count_ja_valid_word
  ja_graph = Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
                        Regexp::MULTILINE,
                        encoding.sub(/ASCII/i, 'none'))
  split_to_word.select { |word| ja_graph.match(word) }.size
end
|
486
|
+
|
487
|
+
# Splits the string into lines, each keeping its trailing EOL sequence; a
# final line without EOL is returned as-is.
#
# Raises RuntimeError when no module is registered for #encoding or #eol.
def split_to_line
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  # Test the VALUE of eol_char: `defined? eol_char` is always true because
  # the method exists, and with a nil eol_char (NoEOL) the pattern would
  # degrade to ".*?|.+", whose first alternative only matches empty strings.
  line_source = eol_char ? ".*?#{eol_char}|.+" : ".+"
  scan(Regexp.new(line_source, Regexp::MULTILINE, encoding.sub(/ASCII/i, 'none')))
end
|
506
|
+
|
507
|
+
# Counts lines containing at least one GRAPH or JA_GRAPH character.
# The pattern is built once, and "ASCII" in the encoding name is replaced
# case-insensitively, as in every other method of this branch.
def count_graph_line
  graph = Regexp.new("[#{Encodings[encoding]::GRAPH}" +
                     "#{Encodings[encoding]::JA_GRAPH}]",
                     Regexp::MULTILINE,
                     encoding.sub(/ASCII/i, 'none'))
  split_to_line.select { |line| graph.match(line) }.size
end

# Counts lines matching "line start, one or more BLANK/JA_BLANK characters,
# optional EOL sequence".
def count_blank_line
  blank = Regexp.new("^[#{Encodings[encoding]::BLANK}" +
                     "#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
                     Regexp::MULTILINE,
                     encoding.sub(/ASCII/i, 'none'))
  split_to_line.select { |line| blank.match(line) }.size
end
|
524
|
+
|
525
|
+
# load encoding modules
|
526
|
+
require 'docdiff/encoding/en_ascii'
|
527
|
+
require 'docdiff/encoding/ja_eucjp'
|
528
|
+
require 'docdiff/encoding/ja_sjis'
|
529
|
+
require 'docdiff/encoding/ja_utf8'
|
530
|
+
end # end ruby_m17n?
|
531
|
+
alias to_bytes split_to_byte
|
532
|
+
alias to_chars split_to_char
|
533
|
+
alias to_words split_to_word
|
534
|
+
alias to_lines split_to_line
|
535
|
+
|
536
|
+
# EOL mixin registered under 'CR': lines end with a carriage return.
module CR
  EOL = 'CR'
  CharString.register_eol(self)

  def eol_char
    "\r"
  end
end

# EOL mixin registered under 'LF': lines end with a line feed.
module LF
  EOL = 'LF'
  CharString.register_eol(self)

  def eol_char
    "\n"
  end
end

# EOL mixin registered under 'CRLF': lines end with CR followed by LF.
module CRLF
  EOL = 'CRLF'
  CharString.register_eol(self)

  def eol_char
    "\r\n"
  end
end

# EOL mixin registered under 'NONE': there is no EOL sequence, so
# eol_char is nil.
module NoEOL
  EOL = 'NONE'
  CharString.register_eol(self)

  def eol_char
    nil
  end
end
|
574
|
+
|
575
|
+
end # module CharString
|
576
|
+
|
577
|
+
# class String
|
578
|
+
# include CharString
|
579
|
+
# end
|