sitediff 0.0.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,238 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'json'
5
+ require 'minitar'
6
+ require 'sitediff'
7
+ require 'sitediff/config'
8
+ require 'zlib'
9
+
10
class SiteDiff
  ##
  # SiteDiff Report Helper.
  #
  # Writes the outcome of a comparison run to disk: one diff file per changed
  # page, a list of paths that did not succeed, an HTML or JSON summary, the
  # settings used for the report and, when +@config.export+ is truthy, a
  # gzipped tar archive of the HTML report.
  class Report
    attr_reader :results, :cache

    ##
    # Directory where diffs will be generated.
    DIFFS_DIR = 'diffs'

    ##
    # Name of file containing a list of pages with diffs.
    FAILURES_FILE = 'failures.txt'

    ##
    # Name of file containing HTML report of diffs.
    REPORT_FILE_HTML = 'report.html'

    ##
    # Name of file containing JSON report of diffs.
    REPORT_FILE_JSON = 'report.json'

    ##
    # Name of file containing exported file archive.
    REPORT_FILE_TAR = 'report.tgz'

    ##
    # Name of directory in which to build the portable report.
    REPORT_BUILD_DIR = '_tmp_report'

    ##
    # Name of the portable report directory.
    REPORT_DIR = 'report'

    ##
    # Path to settings used for report.
    SETTINGS_FILE = 'settings.yaml'

    ##
    # Creates a Reporter object.
    #
    # @param [Config] config
    #   Configuration; this class reads +before_url+, +after_url+ and
    #   +export+ from it.
    # @param [Cache] cache
    #   Cache handed through to the HTML generator.
    # @param [Array] results
    #   Result objects; this class calls +path+, +status+, +error+,
    #   +success?+ and +dump+ on them.
    def initialize(config, cache, results)
      @config = config
      @cache = cache
      @results = results
    end

    ##
    # Generates an HTML report.
    #
    # Writes the diff files, the failures list, the HTML report and the
    # settings file. When +@config.export+ is truthy the report is then
    # packaged into an archive; otherwise the report location is logged.
    #
    # @param [String] dir
    #   The directory in which the report is to be generated.
    # @param [String, nil] report_before
    #   "Before" URL to show in the report; defaults to +@config.before_url+.
    # @param [String, nil] report_after
    #   "After" URL to show in the report; defaults to +@config.after_url+.
    def generate_html(
      dir,
      report_before = nil,
      report_after = nil
    )
      report_before ||= @config.before_url
      report_after ||= @config.after_url

      dir = SiteDiff.ensure_dir dir

      write_diffs dir
      write_failures dir

      # Prepare report.
      report = Diff.generate_html(
        @results,
        report_before,
        report_after,
        @cache,
        @config.export
      )

      # Write report, replacing any report left over from a previous run.
      report_file = dir + REPORT_FILE_HTML
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(report) }

      write_settings dir, report_before, report_after

      if @config.export
        package_report(dir)
      else
        SiteDiff.log "Report generated to #{report_file.expand_path}"
      end
    end

    ##
    # Generates a JSON report.
    #
    # The report contains the number of paths compared, the number of paths
    # that did not succeed and, per path, its status and error message.
    #
    # @param dir
    #   The directory in which the report is to be generated.
    def generate_json(dir)
      dir = SiteDiff.ensure_dir dir
      write_diffs dir
      write_failures dir

      # Prepare report.
      report = {
        paths_compared: @results.length,
        paths_diffs: @results.count { |item| !item.success? },
        paths: {}
      }
      @results.each do |item|
        report[:paths][item.path] = {
          path: item.path,
          status: item.status,
          message: item.error
        }
      end
      report = JSON.generate(report)

      # Write report, replacing any report left over from a previous run.
      report_file = dir + REPORT_FILE_JSON
      report_file.unlink if report_file.file?
      report_file.open('w') { |f| f.write(report) }

      # No before/after URLs are passed here, so both are stored as nil.
      write_settings dir

      SiteDiff.log "Report generated to #{report_file.expand_path}"
    end

    ##
    # Package report for export.
    #
    # Moves the HTML report and the diffs directory into a temporary build
    # directory, archives it as REPORT_FILE_TAR inside +dir+ and removes the
    # build directory again.
    #
    # @param [Pathname] dir
    #   The directory containing the generated report.
    def package_report(dir)
      # Create temporary report directories.
      temp_path = dir + REPORT_BUILD_DIR
      temp_path.rmtree if temp_path.directory?
      temp_path.mkpath
      report_path = temp_path + REPORT_DIR
      report_path.mkpath
      files_path = report_path + 'files'
      files_path.mkpath
      diffs_path = dir + DIFFS_DIR

      # Move files to place.
      FileUtils.move(dir + REPORT_FILE_HTML, report_path)
      FileUtils.move(diffs_path, files_path) if diffs_path.directory?

      # Make tar file. Packing from inside the build directory keeps the
      # archive entries relative to REPORT_DIR.
      Dir.chdir(temp_path) do
        Minitar.pack(
          REPORT_DIR,
          Zlib::GzipWriter.new(File.open(REPORT_FILE_TAR, 'wb'))
        )
      end
      FileUtils.move(temp_path + REPORT_FILE_TAR, dir)
      temp_path.rmtree
      SiteDiff.log 'Archived report generated to ' + dir.join(REPORT_FILE_TAR).to_s
    end

    ##
    # Creates diff files in a directory named "diffs".
    #
    # If "dir" is /foo/bar, then diffs will be placed in /foo/bar/diffs.
    # Any existing "diffs" directory is deleted first. Only results whose
    # status is Result::STATUS_FAILURE are dumped.
    #
    # @param [Pathname] dir
    #   The directory in which a "diffs" directory is to be generated.
    # @raise [ArgumentError] if +dir+ is not a Pathname.
    def write_diffs(dir)
      ensure_pathname dir

      # Delete existing "diffs" dir, if exists.
      diff_dir = dir + DIFFS_DIR
      diff_dir.rmtree if diff_dir.exist?

      # Write diffs to the diff directory.
      @results.each do |r|
        r.dump(dir, @config.export) if r.status == Result::STATUS_FAILURE
      end
      SiteDiff.log "All diff files written to #{diff_dir.expand_path}" unless @config.export
    end

    ##
    # Writes paths with diffs into a file.
    #
    # One path per line, for every result for which +success?+ is false.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @raise [ArgumentError] if +dir+ is not a Pathname.
    def write_failures(dir)
      ensure_pathname dir

      failures = dir + FAILURES_FILE
      SiteDiff.log "All failures written to #{failures.expand_path}"
      failures.open('w') do |f|
        @results.each { |r| f.puts r.path unless r.success? }
      end
    end

    ##
    # Creates report settings.yaml file.
    #
    # TODO: Find a way to avoid having to create this file.
    #
    # @param [Pathname] dir
    #   The directory in which the report is to be generated.
    # @param [String, nil] report_before
    #   Value stored under the "before" key.
    # @param [String, nil] report_after
    #   Value stored under the "after" key.
    # @raise [ArgumentError] if +dir+ is not a Pathname.
    def write_settings(dir, report_before = nil, report_after = nil)
      ensure_pathname dir

      settings = {
        'before' => report_before,
        'after' => report_after,
        'cached' => %w[before after]
      }
      (dir + SETTINGS_FILE).open('w') { |f| YAML.dump(settings, f) }
    end

    ##
    # Returns CSS for HTML report.
    #
    # @return [String] Contents of the bundled stylesheets, concatenated.
    def self.css
      %w[normalize.css sitediff.css]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    ##
    # Returns JS for HTML report.
    #
    # @return [String] Contents of the bundled scripts, concatenated.
    def self.js
      %w[jquery.min.js sitediff.js]
        .map { |name| File.read(File.join(SiteDiff::FILES_DIR, name)) }
        .join
    end

    private

    ##
    # Guards the writer methods, which rely on Pathname operations.
    #
    # @raise [ArgumentError] if +dir+ is not a Pathname.
    def ensure_pathname(dir)
      raise ArgumentError, 'dir must be a Pathname' unless dir.is_a? Pathname
    end
  end
end
@@ -2,25 +2,42 @@
2
2
 
3
3
  require 'sitediff'
4
4
  require 'sitediff/diff'
5
+ require 'sitediff/report'
5
6
  require 'digest/sha1'
6
7
  require 'fileutils'
7
8
 
8
9
  class SiteDiff
9
- class Result < Struct.new(:path, :before, :after, :before_encoding, :after_encoding, :error, :verbose)
10
+ # SiteDiff Result Object.
11
+ class Result < Struct.new(
12
+ :path,
13
+ :before,
14
+ :after,
15
+ :before_encoding,
16
+ :after_encoding,
17
+ :error,
18
+ :verbose
19
+ )
10
20
  STATUS_SUCCESS = 0 # Identical before and after
11
21
  STATUS_FAILURE = 1 # Different before and after
12
22
  STATUS_ERROR = 2 # Couldn't fetch page
13
- STATUS_TEXT = %w[success failure error].freeze
23
+ STATUS_TEXT = %w[unchanged changed error].freeze
14
24
 
15
25
  attr_reader :status, :diff
16
26
 
27
+ ##
28
+ # Creates a Result.
17
29
  def initialize(*args)
18
30
  super
19
31
  if error
20
32
  @status = STATUS_ERROR
21
33
  else
22
34
  if !before_encoding || !after_encoding
23
- @diff = Diff.binary_diffy(before, after, before_encoding, after_encoding)
35
+ @diff = Diff.binary_diffy(
36
+ before,
37
+ after,
38
+ before_encoding,
39
+ after_encoding
40
+ )
24
41
  else
25
42
  @diff = Diff.html_diffy(before, after)
26
43
  end
@@ -28,10 +45,22 @@ class SiteDiff
28
45
  end
29
46
  end
30
47
 
48
+ ##
49
+ # Whether the result has no diff.
50
+ #
51
+ # If there is a diff, it is not a success.
52
+ #
53
+ # TODO: Change "Success" to unchanged.
31
54
  def success?
32
55
  status == STATUS_SUCCESS
33
56
  end
34
57
 
58
+ ##
59
+ # Whether the result has an error.
60
+ def error?
61
+ status == STATUS_ERROR
62
+ end
63
+
35
64
  # Textual representation of the status
36
65
  def status_text
37
66
  STATUS_TEXT[status]
@@ -45,38 +74,37 @@ class SiteDiff
45
74
 
46
75
  # Filename to store diff
47
76
  def filename
48
- File.join(SiteDiff::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
77
+ File.join(Report::DIFFS_DIR, Digest::SHA1.hexdigest(path) + '.html')
49
78
  end
50
79
 
51
- # Text of the link in the HTML report
52
- def link
53
- case status
54
- when STATUS_ERROR then error
55
- when STATUS_SUCCESS then status_text
56
- when STATUS_FAILURE then "<a href='#{filename}'>DIFF</a>"
57
- end
80
+ # Returns a URL to the result diff.
81
+ #
82
+ # Returns nil if the result has no diffs.
83
+ def diff_url(relative = false)
84
+ prefix = relative ? 'files/' : '/files/'
85
+ return prefix + filename if status == STATUS_FAILURE
58
86
  end
59
87
 
60
88
  # Log the result to the terminal
61
89
  def log(verbose = true)
62
90
  case status
63
- when STATUS_SUCCESS then
64
- SiteDiff.log path, :diff_success, 'UNCHANGED'
65
- when STATUS_ERROR then
66
- SiteDiff.log path, :warn, "ERROR (#{error})"
67
- when STATUS_FAILURE then
68
- SiteDiff.log path, :diff_failure, 'CHANGED'
91
+ when STATUS_SUCCESS
92
+ SiteDiff.log path, :success, 'UNCHANGED'
93
+ when STATUS_ERROR
94
+ SiteDiff.log path + " (#{error})", :warning, 'ERROR'
95
+ when STATUS_FAILURE
96
+ SiteDiff.log path, :error, 'CHANGED'
69
97
  puts Diff.terminal_diffy(before, after) if verbose
70
98
  end
71
99
  end
72
100
 
73
101
  # Dump the result to a file
74
- def dump(dir)
102
+ def dump(dir, relative = false)
75
103
  dump_path = File.join(dir, filename)
76
104
  base = File.dirname(dump_path)
77
105
  FileUtils.mkdir_p(base) unless File.exist?(base)
78
106
  File.open(dump_path, 'w') do |f|
79
- f.write(Diff.generate_diff_output(self))
107
+ f.write(Diff.generate_diff_output(self, relative))
80
108
  end
81
109
  end
82
110
  end
@@ -8,21 +8,26 @@ require 'nokogiri'
8
8
  require 'set'
9
9
 
10
10
  class SiteDiff
11
+ # SiteDiff Sanitizer.
11
12
  class Sanitizer
12
13
  class InvalidSanitization < SiteDiffException; end
13
14
 
14
15
  TOOLS = {
15
16
  array: %w[dom_transform sanitization],
16
- scalar: %w[selector remove_spacing]
17
+ scalar: %w[selector remove_spacing ignore_whitespace]
17
18
  }.freeze
18
- DOM_TRANSFORMS = Set.new(%w[remove unwrap_root unwrap remove_class])
19
+ DOM_TRANSFORMS = Set.new(%w[remove strip unwrap_root unwrap remove_class])
19
20
 
21
+ ##
22
+ # Creates a Sanitizer.
20
23
  def initialize(html, config, opts = {})
21
24
  @html = html
22
25
  @config = config
23
26
  @opts = opts
24
27
  end
25
28
 
29
+ ##
30
+ # Performs sanitization.
26
31
  def sanitize
27
32
  return '' if @html == '' # Quick return on empty input
28
33
 
@@ -56,13 +61,13 @@ class SiteDiff
56
61
  def canonicalize_rule(name)
57
62
  (rules = @config[name]) || (return nil)
58
63
 
59
- if rules[0]&.respond_to?(:[]) && rules[0]['value']
60
- # Already an array
64
+ # Already an array? Do nothing.
65
+ if rules[0]&.respond_to?('each') && rules[0]&.fetch('value')
66
+ # If it is a hash, put it in an array.
61
67
  elsif rules['value']
62
- # Hash, put it in an array
63
68
  rules = [rules]
69
+ # If it is a scalar value, put it in an array.
64
70
  else
65
- # Scalar, put it in a hash
66
71
  rules = [{ 'value' => rules }]
67
72
  end
68
73
 
@@ -99,7 +104,13 @@ class SiteDiff
99
104
  # Prevent potential UTF-8 encoding errors by removing bytes
100
105
  # Not the only solution. An alternative is to return the
101
106
  # string unmodified.
102
- @html = @html.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
107
+ @html = @html.encode(
108
+ 'UTF-8',
109
+ 'binary',
110
+ invalid: :replace,
111
+ undef: :replace,
112
+ replace: ''
113
+ )
103
114
  global.each { |r| r.apply(@html) }
104
115
  end
105
116
 
@@ -151,7 +162,13 @@ class SiteDiff
151
162
  # Prevent potential UTF-8 encoding errors by removing invalid bytes.
152
163
  # Not the only solution.
153
164
  # An alternative is to return the string unmodified.
154
- str = str.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
165
+ str = str.encode(
166
+ 'UTF-8',
167
+ 'binary',
168
+ invalid: :replace,
169
+ undef: :replace,
170
+ replace: ''
171
+ )
155
172
  # Remove xml declaration and <html> tags
156
173
  str.sub!(/\A<\?xml.*$\n/, '')
157
174
  str.sub!(/\A^<html>$\n/, '')
@@ -164,6 +181,10 @@ class SiteDiff
164
181
  # Remove blank lines
165
182
  str.gsub!(/^\s*$\n/, '')
166
183
 
184
+ # Remove DOS carriage returns at line ends, both raw (\r) and as &#13; entities
185
+ str.gsub!(/\x0D$/, '')
186
+ str.gsub!(/&#13;$/, '')
187
+
167
188
  str
168
189
  end
169
190
 
@@ -11,18 +11,25 @@ class SiteDiff
11
11
  # * { :type => "unwrap", :selector => "div.field-item" }
12
12
  # * { :type => "remove", :selector => "div.extra-stuff" }
13
13
  # * { :type => "remove_class", :class => 'class1' }
14
+ # * { :type => "strip", :selector => 'h1' }
14
15
  class DomTransform
15
- Transforms = {}
16
+ # Supported dom_transform types.
17
+ TRANSFORMS = {}
16
18
 
19
+ ##
20
+ # Creates a DOM Transform.
17
21
  def initialize(rule)
18
22
  @rule = rule
19
23
  end
20
24
 
25
+ ##
21
26
  # Often an array or scalar are both ok values. Turn either into an array.
22
27
  def to_array(val)
23
28
  [val].flatten
24
29
  end
25
30
 
31
+ ##
32
+ # TODO: Document what this method does.
26
33
  def targets(node)
27
34
  selectors = to_array(@rule['selector'])
28
35
  selectors.each do |sel|
@@ -30,42 +37,70 @@ class SiteDiff
30
37
  end
31
38
  end
32
39
 
40
+ ##
41
+ # Applies the transformation to a DOM node.
33
42
  def apply(node)
34
43
  targets(node) { |t| process(t) }
35
44
  end
36
45
 
46
+ ##
47
+ # Registers a DOM Transform plugin.
37
48
  def self.register(name)
38
- Transforms[name] = self
49
+ TRANSFORMS[name] = self
39
50
  end
40
51
 
52
+ ##
53
+ # Creates a DOM Transform as per rule.
41
54
  def self.create(rule)
42
55
  (type = rule['type']) ||
43
56
  raise(InvalidSanitization, 'DOM transform needs a type')
44
- (transform = Transforms[type]) ||
57
+ (transform = TRANSFORMS[type]) ||
45
58
  raise(InvalidSanitization, "No DOM transform named #{type}")
46
59
  transform.new(rule)
47
60
  end
48
61
 
49
- # Remove elements matching 'selector'
62
+ ##
63
+ # Remove elements matching 'selector'.
50
64
  class Remove < DomTransform
51
65
  register 'remove'
66
+
67
+ ##
68
+ # Processes a node.
52
69
  def process(node)
53
70
  node.remove
54
71
  end
55
72
  end
56
73
 
57
- # Unwrap elements matching 'selector'
74
+ # Strip leading and trailing whitespace from the content of elements matching 'selector'.
75
+ class Strip < DomTransform
76
+ register 'strip'
77
+
78
+ ##
79
+ # Processes a node.
80
+ def process(node)
81
+ node.content = node.content.strip
82
+ end
83
+ end
84
+
85
+ # Unwrap elements matching 'selector'.
58
86
  class Unwrap < DomTransform
59
87
  register 'unwrap'
88
+
89
+ ##
90
+ # Processes a node.
60
91
  def process(node)
61
92
  node.add_next_sibling(node.children)
62
93
  node.remove
63
94
  end
64
95
  end
65
96
 
97
+ ##
66
98
  # Remove classes from elements matching selector
67
99
  class RemoveClass < DomTransform
68
100
  register 'remove_class'
101
+
102
+ ##
103
+ # Processes a node.
69
104
  def process(node)
70
105
  classes = to_array(@rule['class'])
71
106
 
@@ -77,9 +112,13 @@ class SiteDiff
77
112
  end
78
113
  end
79
114
 
80
- # Unwrap the root element
115
+ ##
116
+ # Unwrap the root element.
81
117
  class UnwrapRoot < DomTransform
82
118
  register 'unwrap_root'
119
+
120
+ ##
121
+ # Applies the transformation to a DOM node.
83
122
  def apply(node)
84
123
  (node.children.size == 1) ||
85
124
  raise(InvalidSanitization, 'Multiple root elements in unwrap_root')