docdiff 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +6 -0
- data/.travis.yml +7 -0
- data/Gemfile +17 -0
- data/Guardfile +8 -0
- data/Makefile +108 -0
- data/Rakefile +17 -0
- data/bin/docdiff +179 -0
- data/devutil/JIS0208.TXT +6952 -0
- data/devutil/char_by_charclass.rb +23 -0
- data/devutil/charclass_by_char.rb +21 -0
- data/devutil/jis0208.rb +343 -0
- data/devutil/testjis0208.rb +38 -0
- data/docdiff.conf.example +22 -0
- data/docdiff.gemspec +23 -0
- data/docdiffwebui.cgi +176 -0
- data/docdiffwebui.html +123 -0
- data/img/docdiff-screenshot-format-html-digest-firefox.png +0 -0
- data/img/docdiff-screenshot-format-html-firefox.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-cmdexe-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-rxvtunicode-ja.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-en.png +0 -0
- data/img/docdiff-screenshot-format-tty-xterm-ja.png +0 -0
- data/img/docdiff-screenshot-resolution-linewordchar-xterm.png +0 -0
- data/index.html +181 -0
- data/langfilter.rb +14 -0
- data/lib/doc_diff.rb +170 -0
- data/lib/docdiff.rb +7 -0
- data/lib/docdiff/charstring.rb +579 -0
- data/lib/docdiff/diff.rb +217 -0
- data/lib/docdiff/diff/contours.rb +382 -0
- data/lib/docdiff/diff/editscript.rb +148 -0
- data/lib/docdiff/diff/rcsdiff.rb +107 -0
- data/lib/docdiff/diff/shortestpath.rb +93 -0
- data/lib/docdiff/diff/speculative.rb +40 -0
- data/lib/docdiff/diff/subsequence.rb +39 -0
- data/lib/docdiff/diff/unidiff.rb +124 -0
- data/lib/docdiff/difference.rb +92 -0
- data/lib/docdiff/document.rb +127 -0
- data/lib/docdiff/encoding/en_ascii.rb +97 -0
- data/lib/docdiff/encoding/ja_eucjp.rb +269 -0
- data/lib/docdiff/encoding/ja_sjis.rb +260 -0
- data/lib/docdiff/encoding/ja_utf8.rb +6974 -0
- data/lib/docdiff/version.rb +3 -0
- data/lib/docdiff/view.rb +476 -0
- data/lib/viewdiff.rb +375 -0
- data/readme.html +713 -0
- data/sample/01.en.ascii.cr +1 -0
- data/sample/01.en.ascii.crlf +2 -0
- data/sample/01.en.ascii.lf +2 -0
- data/sample/01.ja.eucjp.lf +2 -0
- data/sample/01.ja.sjis.cr +1 -0
- data/sample/01.ja.sjis.crlf +2 -0
- data/sample/01.ja.utf8.crlf +2 -0
- data/sample/02.en.ascii.cr +1 -0
- data/sample/02.en.ascii.crlf +2 -0
- data/sample/02.en.ascii.lf +2 -0
- data/sample/02.ja.eucjp.lf +2 -0
- data/sample/02.ja.sjis.cr +1 -0
- data/sample/02.ja.sjis.crlf +2 -0
- data/sample/02.ja.utf8.crlf +2 -0
- data/sample/humpty_dumpty01.ascii.lf +4 -0
- data/sample/humpty_dumpty02.ascii.lf +4 -0
- data/test/charstring_test.rb +1008 -0
- data/test/diff_test.rb +36 -0
- data/test/difference_test.rb +64 -0
- data/test/docdiff_test.rb +193 -0
- data/test/document_test.rb +626 -0
- data/test/test_helper.rb +7 -0
- data/test/view_test.rb +570 -0
- data/test/viewdiff_test.rb +908 -0
- metadata +129 -0
data/langfilter.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# language filter
|
3
|
+
# usage: langfilter.rb --en <infile >outfile
|
4
|
+
|
5
|
+
# Reports whether this Ruby has multilingualization (M17N) support,
# detected by checking that String objects respond to #encoding.
#
# Returns true or false. (The previous version fell off the end of the
# method and returned nil, rather than false, when support was missing.)
def ruby_m17n?
  "".respond_to?(:encoding)
end
|
8
|
+
|
9
|
+
# Main script: reads text from ARGF, removes every element whose lang= or
# title= attribute names the *other* language, and writes the result to
# standard output.
#
# The first command-line argument selects the language to keep
# ("--en" or "--ja"; any dashes are stripped).
lang_option = ARGV.shift
# Without an argument ARGV.shift is nil and nil.gsub would raise NoMethodError.
abort "usage: langfilter.rb --en|--ja <infile >outfile" if lang_option.nil?
lang_to_include = lang_option.gsub(/-+/, "")
lang_to_exclude = {"en"=>"ja", "ja"=>"en"}[lang_to_include]
# An unknown language used to interpolate nil into the pattern, which then
# matched (and deleted) elements carrying an empty lang="" attribute.
abort "langfilter.rb: unsupported language #{lang_to_include.inspect}" if lang_to_exclude.nil?
re = /<([a-z]+) +(?:(?:lang|title)="#{lang_to_exclude}").*?>.*?<\/\1>[\r\n]?/m

ARGF.set_encoding("UTF-8") if ruby_m17n?
ARGF.read.gsub(re, "").display
|
data/lib/doc_diff.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
# DocDiff: word/character-oriented text comparison utility
|
2
|
+
# Copyright (C) 2002-2011 Hisashi MORITA
|
3
|
+
# Requirements: Ruby (>= 1.8)
|
4
|
+
class DocDiff
|
5
|
+
|
6
|
+
# Application metadata.
AppVersion = Docdiff::VERSION
Author = "Copyright (C) 2002-2011 Hisashi MORITA.\n" +
         "diff library originates from Ruby/CVS by TANAKA Akira.\n"
License = "This software is licensed under so-called modified BSD license.\n" +
          "See the document for detail.\n"

# Candidate configuration file locations.
SystemConfigFileName = File.join(File::Separator, "etc", "docdiff", "docdiff.conf")
# HOME may be unset (for instance under a web server or cron). File.join(nil, ...)
# would raise TypeError while this class is being loaded, so fall back to "".
UserConfigFileName = File.join(ENV['HOME'] || "", "etc", "docdiff", "docdiff.conf")
AltUserConfigFileName = File.join(ENV['HOME'] || "", ".docdiff", "docdiff.conf")
|
14
|
+
|
15
|
+
# Read/write access to the configuration hash.
attr_accessor :config

# Creates an instance whose configuration starts out empty.
def initialize
  @config = {}
end
|
19
|
+
|
20
|
+
# Parses the text of a configuration file into a Hash.
#
# Each meaningful line has the form "name = value" (whitespace is required on
# both sides of the "="). Everything from "#" to the end of a line is treated
# as a comment; blank lines are ignored.
#
# content:: the whole file content as a String (CR, LF or CRLF line ends).
#
# Returns a Hash mapping Symbol names to values. "on"/"yes"/"true" become
# true, "off"/"no"/"false" become false (case-insensitively), all-digit
# values become Integers, and anything else stays a String.
#
# Raises RuntimeError when a line has no " = " separator or when the name
# contains whitespace.
def DocDiff.parse_config_file_content(content)
  result = {}
  return result if content.size <= 0
  content.split(/\r\n|\r|\n/).each do |raw_line|
    # NOTE(review): this also cuts a value at any "#" it contains.
    line = raw_line.sub(/#.*$/, '').strip
    next if line == ""
    raise 'line does not include " = ".' unless /[\s]+=[\s]+/.match line
    # Limit the split to two fields: only the first " = " separates the name
    # from the value, so a value that itself contains " = " is kept whole
    # (previously everything after the second " = " was silently dropped).
    name_src, value_src = line.split(/[\s]+=[\s]+/, 2)
    raise "Invalid name: #{name_src.inspect}" if (/\s/.match name_src)
    raise "Invalid value: #{value_src.inspect}" unless value_src.kind_of?(String)
    value =
      case value_src.downcase
      when 'on', 'yes', 'true' then true
      when 'off', 'no', 'false' then false
      else
        /^[0-9]+$/.match(value_src) ? value_src.to_i : value_src
      end
    result[name_src.intern] = value
  end
  result
end
|
41
|
+
|
42
|
+
# Builds the line-level Difference between two documents.
#
# doc1, doc2:: objects responding to #split_to_line.
def compare_by_line(doc1, doc2)
  lines_before = doc1.split_to_line
  lines_after = doc2.split_to_line
  Difference.new(lines_before, lines_after)
end
|
45
|
+
|
46
|
+
# Compares two documents by line first, then re-compares every changed line
# word by word.
#
# Elements of the line-level difference whose first item is :change_elt are
# replaced by the elements of a word-level Difference built from their
# before/after text; every other element is copied through unchanged.
#
# Returns a Difference.
def compare_by_line_word(doc1, doc2)
  line_level = compare_by_line(doc1, doc2)
  word_level = Difference.new
  line_level.each do |element|
    unless element.first == :change_elt
      # :common_elt_elt, :del_elt, or :add_elt
      word_level << element
      next
    end
    old_text = Document.new(element[1].join, doc1.encoding, doc1.eol)
    new_text = Document.new(element[2].join, doc2.encoding, doc2.eol)
    refined = Difference.new(old_text.split_to_word, new_text.split_to_word)
    refined.each { |word| word_level << word }
  end
  word_level
end
|
65
|
+
|
66
|
+
# Compares two documents by line, then by word, then by character.
#
# The line/word stage is delegated to compare_by_line_word instead of being
# repeated here verbatim. Every element of that result whose first item is
# :change_elt is then replaced by the elements of a character-level
# Difference; all other elements are copied through unchanged.
#
# Returns a Difference.
def compare_by_line_word_char(doc1, doc2)
  lines_and_words = compare_by_line_word(doc1, doc2)
  lines_words_and_chars = Difference.new
  lines_and_words.each{|line_or_word|
    if line_or_word.first == :change_elt
      before_change = Document.new(line_or_word[1].join, doc1.encoding, doc1.eol)
      after_change  = Document.new(line_or_word[2].join, doc2.encoding, doc2.eol)
      Difference.new(before_change.split_to_char, after_change.split_to_char).each{|char|
        lines_words_and_chars << char
      }
    else # :common_elt_elt, :del_elt, or :add_elt
      lines_words_and_chars << line_or_word
    end
  }
  lines_words_and_chars
end
|
98
|
+
|
99
|
+
# Compares two documents and renders the result as a single String.
#
# doc1, doc2:: the documents to compare.
# option::     Hash with
#              :resolution - "line", "word" or "char"
#              :format     - "tty", "html", "manued", "wdiff", "stat" or "user"
#              :digest     - truthy selects the digest view; a missing or nil
#                            value is treated as false (it used to leave the
#                            result nil and crash with NoMethodError on join).
#
# For the "user" format the start/end tags are read from @config; missing
# entries are stored in @config as '' as a side effect.
#
# Raises RuntimeError when option or a mandatory entry is nil, when the
# documents fail the sanity check, or when the resolution or format is not
# supported.
def run(doc1, doc2, option)
  raise "option is nil" if option.nil?
  raise "option[:resolution] is nil" if option[:resolution].nil?
  raise "option[:format] is nil" if option[:format].nil?
  # NOTE(review): the first true condition ends this case, so the check
  # passes as soon as ANY one of the three holds; it only raises when all
  # three fail. Left as is because tightening it could break callers.
  case
  when doc1.class == Document && doc2.class == Document # OK
  when doc1.encoding != nil && doc2.encoding != nil # OK
  when doc1.encoding == doc2.encoding && doc1.eol == doc2.eol # OK
  else
    raise("Error! Blame the author (doc1: #{doc1.encoding}, #{doc1.eol}, doc2: #{doc2.encoding}, #{doc2.eol}).")
  end

  difference =
    case option[:resolution]
    when "line" then compare_by_line(doc1, doc2)
    when "word" then compare_by_line_word(doc1, doc2)
    when "char" then compare_by_line_word_char(doc1, doc2)
    else
      raise "Unsupported resolution: #{option[:resolution].inspect}"
    end
  view = View.new(difference, doc1.encoding, doc1.eol)
  user_tags = {:start_common        => (@config[:tag_common_start] ||= ''),
               :end_common          => (@config[:tag_common_end] ||= ''),
               :start_del           => (@config[:tag_del_start] ||= ''),
               :end_del             => (@config[:tag_del_end] ||= ''),
               :start_add           => (@config[:tag_add_start] ||= ''),
               :end_add             => (@config[:tag_add_end] ||= ''),
               :start_before_change => (@config[:tag_change_before_start] ||= ''),
               :end_before_change   => (@config[:tag_change_before_end] ||= ''),
               :start_after_change  => (@config[:tag_change_after_start] ||= ''),
               :end_after_change    => (@config[:tag_change_after_end] ||= '')}
  digest = option[:digest] ? true : false
  result =
    case option[:format]
    when "tty"    then digest ? view.to_tty_digest(option)    : view.to_tty(option)
    when "html"   then digest ? view.to_html_digest(option)   : view.to_html(option)
    when "manued" then digest ? view.to_manued_digest(option) : view.to_manued(option)
    when "wdiff"  then digest ? view.to_wdiff_digest(option)  : view.to_wdiff(option)
    when "stat"   then view.to_stat(option) # same view with or without digest
    when "user"   then digest ? view.to_user_digest(user_tags) : view.to_user(user_tags)
    else
      raise "Unsupported output format: #{option[:format].inspect}."
    end
  result.join
end
|
155
|
+
|
156
|
+
# Reads a configuration file and merges its settings into config.
#
# filename:: path of the configuration file.
#
# Returns nil when the file was read, or a message String when the file
# does not exist. Other errors (including parse errors) propagate.
def process_config_file(filename)
  begin
    text = File.open(filename, "r") { |io| io.read }
  rescue Errno::ENOENT
    return "config file not found so not read."
  end
  config.update(DocDiff.parse_config_file_content(text)) unless text.nil?
  nil
end
|
169
|
+
|
170
|
+
end # class DocDiff
|
data/lib/docdiff/charstring.rb
ADDED
@@ -0,0 +1,579 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# Character String module.
|
3
|
+
# To use, include to String, or extend String.
|
4
|
+
# 2003- Hisashi MORITA
|
5
|
+
|
6
|
+
module CharString
|
7
|
+
|
8
|
+
Encodings = {}
|
9
|
+
EOLChars = {} # End-of-line characters, such as CR, LF, CRLF.
|
10
|
+
|
11
|
+
# Passes construction straight through to the superclass.
#
# Guessing the encoding and EOL at construction time was tried and then
# disabled as unnecessary:
#   @encoding = CharString.guess_encoding(string)
#   @eol = CharString.guess_eol(string)
def initialize(string)
  super
end
|
18
|
+
|
19
|
+
# Returns the EOL name stored in @eol (assigned by eol=).
def eol()
  @eol
  # Lazy auto-detection, currently disabled:
  # if @eol
  #   @eol
  # else
  #   @eol = CharString.guess_eol(self)
  #   # raise "eol is not set.\n"
  # end
end
|
28
|
+
|
29
|
+
# Records the EOL name and extends this object with the EOL module
# registered under that name in EOLChars.
def eol=(e)
  @eol = e
  eol_module = EOLChars[@eol]
  extend(eol_module)
end
|
33
|
+
|
34
|
+
# Default end-of-line sequence: the value of @eol_char when it is set to
# something truthy, otherwise nil. The EOL modules (CR, LF, ...) provide
# their own eol_char.
def eol_char()
  @eol_char ? @eol_char : nil
end
|
43
|
+
|
44
|
+
# Returns a one-line description of this object (object id, class, content
# and the encoding/EOL modules in use) for debugging.
#
# The encoding and EOL names are read through the #encoding and #eol
# accessors rather than the instance variables: on Ruby 1.9+ this file never
# assigns @encoding, so the old check always failed with "@encoding is nil.".
# Object#id (removed in Ruby 1.9) is replaced by #object_id.
#
# Raises RuntimeError when the encoding or EOL is missing or has no
# registered module.
def debug()
  encoding_name = encoding
  eol_name = eol
  case
  when encoding_name == nil
    raise "@encoding is nil."
  when Encodings[encoding_name] == nil
    raise "Encodings[@encoding(=#{encoding_name})] is nil."
  when Encodings[encoding_name].class != Module
    raise "Encodings[@encoding].class(=#{Encodings[encoding_name].class}) is not a module."
  when eol_name == nil
    raise "@eol is nil."
  when EOLChars[eol_name] == nil
    raise "EOLChars[@eol(=#{eol_name})] is nil."
  end
  ["id: #{object_id}, class: #{self.class}, self: #{self}, ",
   "module: #{Encodings[encoding_name]}, #{EOLChars[eol_name]}"].join
end
|
62
|
+
|
63
|
+
# Registers an encoding module in Encodings under its Encoding constant.
def CharString.register_encoding(mod)
  Encodings.store(mod::Encoding, mod)
end
|
66
|
+
|
67
|
+
# Registers an EOL module in EOLChars under its EOL constant.
def CharString.register_eol(mod)
  EOLChars.store(mod::EOL, mod)
end
|
70
|
+
|
71
|
+
# Guesses the end-of-line convention used in a string.
#
# Returns nil when string is nil, 'CR', 'LF' or 'CRLF' when exactly one
# flavor occurs, 'NONE' when no EOL occurs (possibly a one-line file) and
# 'UNKNOWN' when several flavors are mixed (possibly binary data).
def CharString.guess_eol(string)
  return nil if string == nil
  flavors = []
  flavors << 'CR'   if string.scan(/(\r)(?!\n)/o).size != 0
  flavors << 'LF'   if string.scan(/(?:\A|[^\r])(\n)/o).size != 0
  flavors << 'CRLF' if string.scan(/(\r\n)/o).size != 0
  case flavors.size
  when 0 then 'NONE'
  when 1 then flavors.first
  else 'UNKNOWN'
  end
end
|
89
|
+
|
90
|
+
# True when String objects respond to #force_encoding; used below to choose
# between the "Ruby-1.9" and "Ruby-1.8" implementations.
def CharString.ruby_m17n?
  "".respond_to?(:force_encoding)
end
|
93
|
+
|
94
|
+
# Note that some languages (like Japanese) do not have 'word' or 'phrase',
|
95
|
+
# thus some of the following methods are not 'linguistically correct'.
|
96
|
+
|
97
|
+
# Number of bytes, as produced by split_to_byte.
def count_byte()
  split_to_byte.length
end
|
100
|
+
|
101
|
+
# Number of characters, as produced by split_to_char (an EOL counts as one).
def count_char()
  split_to_char.length
end
|
104
|
+
|
105
|
+
# Sum of the Latin and Japanese graph-character counts.
def count_graph_char()
  latin = count_latin_graph_char
  japanese = count_ja_graph_char
  latin + japanese
end
|
108
|
+
|
109
|
+
# Sum of the Latin and Japanese blank-character counts.
def count_blank_char()
  latin = count_latin_blank_char
  japanese = count_ja_blank_char
  latin + japanese
end
|
112
|
+
|
113
|
+
# Number of words, as produced by split_to_word.
def count_word()
  split_to_word.length
end
|
116
|
+
|
117
|
+
# Sum of the Latin and Japanese valid-word counts.
def count_valid_word()
  latin = count_latin_valid_word
  japanese = count_ja_valid_word
  latin + japanese
end
|
120
|
+
|
121
|
+
# Number of lines, as produced by split_to_line (common to all encodings).
def count_line()
  split_to_line.length
end
|
124
|
+
|
125
|
+
# Number of lines that match the empty-line pattern: a line start followed
# directly by the EOL sequence, or an empty line.
def count_empty_line()
  empty_lines = split_to_line.select do |line|
    /^(?:#{eol_char})|^$/m.match line
  end
  empty_lines.size
end
|
130
|
+
|
131
|
+
if ruby_m17n?
|
132
|
+
# for Ruby-1.9
|
133
|
+
# Name of this string's encoding as a String (for example "UTF-8").
def encoding()
  plain_copy = String.new(self)
  plain_copy.encoding.name
end
|
136
|
+
|
137
|
+
# Relabels this string with the given encoding via force_encoding, which
# changes the encoding tag without transcoding the bytes.
def encoding=(cs)
  # self is never nil or false, so the original "if self" guard always held.
  force_encoding(cs)
end
|
140
|
+
|
141
|
+
# Returns the name of the encoding the string is tagged with, or nil when
# string is nil or false.
def CharString.guess_encoding(string)
  string ? string.encoding.to_s : nil
end
|
148
|
+
|
149
|
+
# Splits the string into an Array of one-byte ASCII-8BIT strings.
#
# The bytes of a copy are relabelled as binary with force_encoding.
# Transcoding with encode("ASCII-8BIT"), as done before, raises
# Encoding::UndefinedConversionError for any non-ASCII character, so
# count_byte failed on Japanese text. The receiver itself is not modified.
def split_to_byte()
  String.new(self).force_encoding("ASCII-8BIT").scan(/./nm)
end
|
152
|
+
|
153
|
+
# Splits the string into characters, keeping an EOL sequence (which may be
# two characters, e.g. CRLF) together as a single element.
#
# Matching is done on a UTF-8 copy; each piece is converted back to the
# string's own encoding.
def split_to_char()
  # Some strings have no end-of-line char (no EOL module was extended).
  char_src = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  utf8 = encode('UTF-8')
  pieces = utf8.scan(Regexp.new(char_src, Regexp::MULTILINE))
  pieces.map { |piece| piece.encode(self.encoding) }
end
|
164
|
+
|
165
|
+
# Counts the characters matching the GRAPH class of the UTF-8 encoding module.
def count_latin_graph_char()
  utf8 = encode('UTF-8')
  graph_re = Regexp.new("[#{Encodings['UTF-8']::GRAPH}]", Regexp::MULTILINE)
  utf8.scan(graph_re).size
end
|
170
|
+
|
171
|
+
# Counts the characters matching the JA_GRAPH class of the UTF-8 encoding module.
def count_ja_graph_char()
  utf8 = encode('UTF-8')
  ja_graph_re = Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]", Regexp::MULTILINE)
  utf8.scan(ja_graph_re).size
end
|
176
|
+
|
177
|
+
# Counts the characters matching the BLANK class of the UTF-8 encoding module.
def count_latin_blank_char()
  utf8 = encode('UTF-8')
  blank_re = Regexp.new("[#{Encodings['UTF-8']::BLANK}]", Regexp::MULTILINE)
  utf8.scan(blank_re).size
end
|
182
|
+
|
183
|
+
# Counts the characters matching the JA_BLANK class of the UTF-8 encoding module.
def count_ja_blank_char()
  utf8 = encode('UTF-8')
  ja_blank_re = Regexp.new("[#{Encodings['UTF-8']::JA_BLANK}]", Regexp::MULTILINE)
  utf8.scan(ja_blank_re).size
end
|
188
|
+
|
189
|
+
# Splits the string into words using the WORD_REGEXP_SRC pattern of the
# UTF-8 encoding module. Matching is done on a UTF-8 copy; each piece is
# converted back to the string's own encoding.
def split_to_word()
  utf8 = encode('UTF-8')
  word_re = Regexp.new(Encodings['UTF-8']::WORD_REGEXP_SRC, Regexp::MULTILINE)
  utf8.scan(word_re).map { |word| word.encode(self.encoding) }
end
|
194
|
+
|
195
|
+
# Counts the words that contain at least one character of the PRINT class
# of the UTF-8 encoding module.
def count_latin_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings['UTF-8']::PRINT}]",
               Regexp::MULTILINE).match(word.encode('UTF-8'))
  end
  matching.size
end
|
201
|
+
|
202
|
+
# Counts the words that contain at least one character of the JA_PRINT class
# of the UTF-8 encoding module.
def count_ja_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings['UTF-8']::JA_PRINT}]",
               Regexp::MULTILINE).match(word.encode('UTF-8'))
  end
  matching.size
end
|
208
|
+
|
209
|
+
# Counts the words that contain at least one character of the ALNUM class
# of the UTF-8 encoding module.
def count_latin_valid_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings['UTF-8']::ALNUM}]",
               Regexp::MULTILINE).match(word.encode('UTF-8'))
  end
  matching.size
end
|
215
|
+
|
216
|
+
# Counts the words that contain at least one character of the JA_GRAPH class
# of the UTF-8 encoding module.
def count_ja_valid_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings['UTF-8']::JA_GRAPH}]",
               Regexp::MULTILINE).match(word.encode('UTF-8'))
  end
  matching.size
end
|
222
|
+
|
223
|
+
# Splits the string into lines; each line keeps its trailing EOL sequence.
# Matching is done on a UTF-8 copy; each piece is converted back to the
# string's own encoding.
#
# When eol_char is nil (no EOL, e.g. a one-line document) the whole content
# is returned as a single line. The previous test, "defined? eol_char", was
# always true, so a nil eol_char produced the pattern ".*?|.+", which
# matches the empty string at every position and yielded only empty strings.
#
# Raises RuntimeError when no EOL module is registered for eol.
def split_to_line()
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  line_src = eol_char ? ".*?#{eol_char}|.+" : ".+"
  utf8 = encode('UTF-8')
  utf8.scan(Regexp.new(line_src, Regexp::MULTILINE)).map{|e| e.encode(self.encoding)}
end
|
235
|
+
|
236
|
+
# Counts the lines containing at least one character of the GRAPH or
# JA_GRAPH class of the UTF-8 encoding module.
def count_graph_line()
  graph_lines = split_to_line.select do |line|
    Regexp.new("[#{Encodings['UTF-8']::GRAPH}" +
               "#{Encodings['UTF-8']::JA_GRAPH}]",
               Regexp::MULTILINE).match(line.encode('UTF-8'))
  end
  graph_lines.size
end
|
243
|
+
|
244
|
+
# Counts the lines that start with one or more characters of the BLANK or
# JA_BLANK class of the UTF-8 encoding module, optionally followed by the
# EOL sequence.
def count_blank_line()
  blank_lines = split_to_line.select do |line|
    Regexp.new("^[#{Encodings['UTF-8']::BLANK}" +
               "#{Encodings['UTF-8']::JA_BLANK}]+(?:#{eol_char})?",
               Regexp::MULTILINE).match(line.encode('UTF-8'))
  end
  blank_lines.size
end
|
251
|
+
|
252
|
+
# load encoding modules
|
253
|
+
require 'docdiff/encoding/en_ascii'
|
254
|
+
require 'docdiff/encoding/ja_eucjp'
|
255
|
+
require 'docdiff/encoding/ja_sjis'
|
256
|
+
require 'docdiff/encoding/ja_utf8'
|
257
|
+
else
|
258
|
+
# for Ruby-1.8
|
259
|
+
require 'iconv'
|
260
|
+
|
261
|
+
# Returns the encoding name stored in @encoding (assigned by encoding=).
def encoding()
  @encoding
  # Lazy auto-detection, currently disabled:
  # if @encoding
  #   @encoding
  # else
  #   @encoding = CharString.guess_encoding(self)
  #   # raise "encoding is not set.\n"
  # end
end
|
270
|
+
|
271
|
+
# Records the encoding name and extends this object with the encoding
# module registered under that name in Encodings.
def encoding=(cs)
  @encoding = cs
  encoding_module = Encodings[@encoding]
  extend(encoding_module)
end
|
275
|
+
|
276
|
+
# Guesses the encoding of a string by running both detectors and accepting
# the answer only when they agree.
#
# Returns nil when string is nil; otherwise the agreed name ('US-ASCII',
# 'JIS', 'EUC-JP', 'Shift_JIS', 'UTF-8' or 'UNKNOWN'), or 'UNKNOWN' when
# the detectors disagree.
def CharString.guess_encoding(string)
  return nil if string == nil
  by_pureruby = CharString.guess_encoding_using_pureruby(string)
  by_iconv = CharString.guess_encoding_using_iconv(string)
  by_pureruby == by_iconv ? by_pureruby : "UNKNOWN"
end
|
287
|
+
|
288
|
+
# Guesses the encoding of a string with byte-pattern regular expressions.
#
# Returns nil when string is nil, otherwise 'US-ASCII', 'JIS', 'EUC-JP',
# 'Shift_JIS', 'UTF-8', or 'UNKNOWN'.
def CharString.guess_encoding_using_pureruby(string)
  return nil if string == nil

  # Pattern sources, interpolated below into regexps compiled with the
  # /n (no encoding) option.
  ascii_pat = '[\x00-\x7f]'
  # ISO-2022-JP escape sequences (ESC ( B, ESC ( J, ESC ( I, ESC $ @,
  # ESC $ B, ESC $ D).
  jis_pat = ['(?:(?:\x1b\x28\x42)',
             '|(?:\x1b\x28\x4a)',
             '|(?:\x1b\x28\x49)',
             '|(?:\x1b\x24\x40)',
             '|(?:\x1b\x24\x42)',
             '|(?:\x1b\x24\x44))'].join
  eucjp_pat = ['(?:(?:[\x00-\x1f\x7f])',
               '|(?:[\x20-\x7e])',
               '|(?:\x8e[\xa1-\xdf])',
               '|(?:[\xa1-\xfe][\xa1-\xfe])',
               '|(?:\x8f[\xa1-\xfe][\xa1-\xfe]))'].join
  # NOTE(review): each two-byte alternative accepts a different trail-byte
  # range (0x40-0x7e after 0x81-0x9f, 0x80-0xfc after 0xe0-0xef). Shift_JIS
  # trail bytes normally span both ranges for either lead byte - confirm
  # whether the narrower patterns are intentional.
  sjis_pat = ['(?:(?:[\x00-\x1f\x7f])',
              '|(?:[\x20-\x7e])',
              '|(?:[\xa1-\xdf])',
              '|(?:[\x81-\x9f][\x40-\x7e])',
              '|(?:[\xe0-\xef][\x80-\xfc]))'].join
  utf8_pat = ['(?:(?:[\x00-\x7f])',
              '|(?:[\xc0-\xdf][\x80-\xbf])',
              '|(?:[\xe0-\xef][\x80-\xbf][\x80-\xbf])',
              '|(?:[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]))'].join

  # For each candidate, the total length of everything its pattern matched
  # (for JIS, the number of escape sequences found).
  ascii_match_length = string.scan(/#{ascii_pat}/on).join.length
  jis_escseq_count   = string.scan(/#{jis_pat}/on).size
  eucjp_match_length = string.scan(/#{eucjp_pat}/no).join.length
  sjis_match_length  = string.scan(/#{sjis_pat}/no).join.length
  utf8_match_length  = string.scan(/#{utf8_pat}/no).join.length

  case
  when 0 < jis_escseq_count # JIS escape sequence found
    guessed_encoding = 'JIS'
  when ascii_match_length == string.length # every char is ASCII (but not JIS)
    guessed_encoding = 'US-ASCII'
  else
    case
    when eucjp_match_length < (string.length / 2) &&
         sjis_match_length < (string.length / 2) &&
         utf8_match_length < (string.length / 2)
      guessed_encoding = 'UNKNOWN' # no encoding matched at least half of the string
    # Otherwise the candidate with the strictly longest match wins.
    when (eucjp_match_length < utf8_match_length) &&
         (sjis_match_length < utf8_match_length)
      guessed_encoding = 'UTF-8'
    when (eucjp_match_length < sjis_match_length) &&
         (utf8_match_length < sjis_match_length)
      guessed_encoding = 'Shift_JIS'
    when (sjis_match_length < eucjp_match_length) &&
         (utf8_match_length < eucjp_match_length)
      guessed_encoding = 'EUC-JP'
    else
      guessed_encoding = 'UNKNOWN' # cannot guess at all
    end
  end
  return guessed_encoding
end
|
346
|
+
|
347
|
+
# Guesses the encoding of a string by checking which encodings Iconv
# accepts it as (see CharString.valid_as).
#
# Each candidate is validated once; "invalid as X" is simply the negation of
# "valid as X". The previous version ran every Iconv conversion twice and
# computed an unused invalid_as_ascii.
#
# Returns nil when string is nil, otherwise 'US-ASCII', 'JIS', 'EUC-JP',
# 'Shift_JIS', 'UTF-8' or 'UNKNOWN'.
def CharString.guess_encoding_using_iconv(string)
  return nil if string == nil
  valid_as_utf8  = CharString.valid_as("utf-8", string)
  valid_as_sjis  = CharString.valid_as("cp932", string) # not sjis, but cp932
  valid_as_jis   = CharString.valid_as("iso-2022-jp", string)
  valid_as_eucjp = CharString.valid_as("eucjp", string)
  valid_as_ascii = CharString.valid_as("ascii", string)
  case
  when valid_as_ascii
    "US-ASCII"
  when valid_as_jis # Iconv sometimes recognizes JIS for ASCII, ignoring JIS escape sequence.
    "JIS"
  when valid_as_eucjp
    "EUC-JP"
  # From here on the string is known to be invalid as JIS and as EUC-JP.
  when valid_as_sjis && !valid_as_utf8
    "Shift_JIS"
  when valid_as_utf8 && !valid_as_sjis
    "UTF-8"
  else
    "UNKNOWN"
  end
end
|
375
|
+
|
376
|
+
# Tells whether Iconv can convert the string from the named encoding to
# itself without an illegal-sequence, invalid-character or out-of-range
# error. Returns true or false.
def CharString.valid_as(encoding_name, string)
  Iconv.iconv(encoding_name, encoding_name, string)
  true
rescue Iconv::IllegalSequence, Iconv::InvalidCharacter, Iconv::OutOfRange
  false
end
|
385
|
+
|
386
|
+
# Logical negation of CharString.valid_as. Returns true or false.
def CharString.invalid_as(encoding_name, string)
  !CharString.valid_as(encoding_name, string)
end
|
393
|
+
|
394
|
+
# Splits the string into an Array of single bytes. The /n option makes the
# regexp byte-oriented and /m lets "." match newlines as well.
def split_to_byte()
  scan(/./nm)
end
|
397
|
+
|
398
|
+
# Splits the string into characters, keeping an EOL sequence together as a
# single element.
#
# Raises RuntimeError when no encoding module is registered for encoding.
def split_to_char()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  # Some strings have no end-of-line char (no EOL module was extended).
  char_src = eol_char ? "(?:#{eol_char})|(?:.)" : "(?:.)"
  char_re = Regexp.new(char_src,
                       Regexp::MULTILINE,
                       encoding.sub(/ASCII/i, 'none'))
  scan(char_re)
end
|
413
|
+
|
414
|
+
# Counts the characters matching the GRAPH class of the current encoding module.
#
# Raises RuntimeError when no encoding module is registered for encoding.
def count_latin_graph_char()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  graph_re = Regexp.new("[#{Encodings[encoding]::GRAPH}]",
                        Regexp::MULTILINE,
                        encoding.sub(/ASCII/i, 'none'))
  scan(graph_re).size
end
|
422
|
+
|
423
|
+
# Counts the characters matching the JA_GRAPH class of the current encoding module.
#
# Raises RuntimeError when no encoding module is registered for encoding.
def count_ja_graph_char()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  ja_graph_re = Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
                           Regexp::MULTILINE,
                           encoding.sub(/ASCII/i, 'none'))
  scan(ja_graph_re).size
end
|
431
|
+
|
432
|
+
# Counts the characters matching the BLANK class of the current encoding module.
def count_latin_blank_char()
  blank_re = Regexp.new("[#{Encodings[encoding]::BLANK}]",
                        Regexp::MULTILINE,
                        encoding.sub(/ASCII/i, 'none'))
  scan(blank_re).size
end
|
438
|
+
|
439
|
+
# Counts the characters matching the JA_BLANK class of the current encoding module.
def count_ja_blank_char()
  ja_blank_re = Regexp.new("[#{Encodings[encoding]::JA_BLANK}]",
                           Regexp::MULTILINE,
                           encoding.sub(/ASCII/i, 'none'))
  scan(ja_blank_re).size
end
|
445
|
+
|
446
|
+
# Splits the string into words using the WORD_REGEXP_SRC pattern of the
# current encoding module.
#
# Raises RuntimeError when no encoding module is registered for encoding.
def split_to_word()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  word_re = Regexp.new(Encodings[encoding]::WORD_REGEXP_SRC,
                       Regexp::MULTILINE,
                       encoding.sub(/ASCII/i, 'none'))
  scan(word_re)
end
|
454
|
+
|
455
|
+
# Counts the words that contain at least one character of the PRINT class
# of the current encoding module.
def count_latin_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings[encoding]::PRINT}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match(word)
  end
  matching.size
end
|
462
|
+
|
463
|
+
# Counts the words that contain at least one character of the JA_PRINT class
# of the current encoding module.
def count_ja_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings[encoding]::JA_PRINT}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match(word)
  end
  matching.size
end
|
470
|
+
|
471
|
+
# Counts the words that contain at least one character of the ALNUM class
# of the current encoding module.
def count_latin_valid_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings[encoding]::ALNUM}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match(word)
  end
  matching.size
end
|
478
|
+
|
479
|
+
# Counts the words that contain at least one character of the JA_GRAPH class
# of the current encoding module.
def count_ja_valid_word()
  matching = split_to_word.select do |word|
    Regexp.new("[#{Encodings[encoding]::JA_GRAPH}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/i, 'none')).match(word)
  end
  matching.size
end
|
486
|
+
|
487
|
+
# Splits the string into lines; each line keeps its trailing EOL sequence.
#
# When eol_char is nil (no EOL, e.g. a one-line document) the whole content
# is returned as a single line. The previous test, "defined? eol_char", was
# always true, so a nil eol_char produced the pattern ".*?|.+", which
# matches the empty string at every position.
#
# Raises RuntimeError when no encoding module is registered for encoding or
# no EOL module is registered for eol.
def split_to_line()
  raise "Encodings[encoding] is #{Encodings[encoding].inspect}: encoding not specified or auto-detection failed." unless Encodings[encoding]
  raise "EOLChars[eol] is #{EOLChars[eol].inspect}: eol not specified or auto-detection failed." unless EOLChars[eol]
  line_src = eol_char ? ".*?#{eol_char}|.+" : ".+"
  scan(Regexp.new(line_src,
                  Regexp::MULTILINE,
                  encoding.sub(/ASCII/i, 'none')))
end
|
506
|
+
|
507
|
+
# Counts the lines containing at least one character of the GRAPH or
# JA_GRAPH class of the current encoding module.
def count_graph_line()
  graph_lines = split_to_line.select do |line|
    Regexp.new("[#{Encodings[encoding]::GRAPH}" +
               "#{Encodings[encoding]::JA_GRAPH}]",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/, 'none')).match(line)
  end
  graph_lines.size
end
|
515
|
+
|
516
|
+
# Counts the lines that start with one or more characters of the BLANK or
# JA_BLANK class of the current encoding module, optionally followed by the
# EOL sequence.
def count_blank_line()
  blank_lines = split_to_line.select do |line|
    Regexp.new("^[#{Encodings[encoding]::BLANK}" +
               "#{Encodings[encoding]::JA_BLANK}]+(?:#{eol_char})?",
               Regexp::MULTILINE,
               encoding.sub(/ASCII/, 'none')).match(line)
  end
  blank_lines.size
end
|
524
|
+
|
525
|
+
# load encoding modules
|
526
|
+
require 'docdiff/encoding/en_ascii'
|
527
|
+
require 'docdiff/encoding/ja_eucjp'
|
528
|
+
require 'docdiff/encoding/ja_sjis'
|
529
|
+
require 'docdiff/encoding/ja_utf8'
|
530
|
+
end # end ruby_m17n?
|
531
|
+
alias to_bytes split_to_byte
|
532
|
+
alias to_chars split_to_char
|
533
|
+
alias to_words split_to_word
|
534
|
+
alias to_lines split_to_line
|
535
|
+
|
536
|
+
# EOL module for carriage-return line ends; registered under 'CR'.
module CR
  EOL = 'CR'

  # End-of-line sequence provided to objects extended with this module.
  def eol_char()
    "\r"
  end

  CharString.register_eol(self)
end
|
545
|
+
|
546
|
+
# EOL module for line-feed line ends; registered under 'LF'.
module LF
  EOL = 'LF'

  # End-of-line sequence provided to objects extended with this module.
  def eol_char()
    "\n"
  end

  CharString.register_eol(self)
end
|
555
|
+
|
556
|
+
# EOL module for carriage-return + line-feed line ends; registered under 'CRLF'.
module CRLF
  EOL = 'CRLF'

  # End-of-line sequence provided to objects extended with this module.
  def eol_char()
    "\r\n"
  end

  CharString.register_eol(self)
end
|
565
|
+
|
566
|
+
# EOL module for text without any line end; registered under 'NONE'.
module NoEOL
  EOL = 'NONE'
  # There is no end-of-line sequence, so this returns nil.
  def eol_char()
    nil
  end

  CharString.register_eol(self)
end
|
574
|
+
|
575
|
+
end # module CharString
|
576
|
+
|
577
|
+
# class String
|
578
|
+
# include CharString
|
579
|
+
# end
|