right_scraper 3.2.6 → 5.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. checksums.yaml +7 -0
  2. data/lib/right_scraper.rb +16 -34
  3. data/lib/right_scraper/builders.rb +32 -0
  4. data/lib/right_scraper/builders/base.rb +19 -20
  5. data/lib/right_scraper/builders/filesystem.rb +8 -6
  6. data/lib/right_scraper/builders/union.rb +4 -1
  7. data/lib/right_scraper/loggers.rb +31 -0
  8. data/lib/right_scraper/loggers/base.rb +113 -0
  9. data/lib/right_scraper/loggers/default.rb +98 -0
  10. data/lib/right_scraper/{scraper.rb → main.rb} +53 -9
  11. data/lib/right_scraper/processes.rb +33 -0
  12. data/lib/right_scraper/processes/shell.rb +227 -0
  13. data/lib/right_scraper/processes/{ssh.rb → ssh_agent.rb} +4 -0
  14. data/lib/right_scraper/processes/svn_client.rb +117 -0
  15. data/lib/right_scraper/processes/warden.rb +358 -0
  16. data/lib/right_scraper/registered_base.rb +154 -0
  17. data/lib/right_scraper/repositories.rb +33 -0
  18. data/lib/right_scraper/repositories/base.rb +271 -232
  19. data/lib/right_scraper/repositories/download.rb +8 -6
  20. data/lib/right_scraper/repositories/git.rb +8 -9
  21. data/lib/right_scraper/repositories/svn.rb +8 -8
  22. data/lib/right_scraper/resources.rb +32 -0
  23. data/lib/right_scraper/resources/base.rb +5 -1
  24. data/lib/right_scraper/resources/cookbook.rb +34 -27
  25. data/lib/right_scraper/resources/workflow.rb +27 -28
  26. data/lib/right_scraper/retrievers.rb +34 -0
  27. data/lib/right_scraper/retrievers/base.rb +80 -84
  28. data/lib/right_scraper/retrievers/checkout_base.rb +178 -0
  29. data/lib/right_scraper/retrievers/download.rb +125 -117
  30. data/lib/right_scraper/retrievers/git.rb +377 -223
  31. data/lib/right_scraper/retrievers/svn.rb +102 -62
  32. data/lib/right_scraper/scanners.rb +37 -0
  33. data/lib/right_scraper/scanners/base.rb +77 -80
  34. data/lib/right_scraper/scanners/cookbook_manifest.rb +31 -30
  35. data/lib/right_scraper/scanners/cookbook_metadata.rb +380 -35
  36. data/lib/right_scraper/scanners/cookbook_s3_upload.rb +56 -53
  37. data/lib/right_scraper/scanners/union.rb +61 -58
  38. data/lib/right_scraper/scanners/workflow_manifest.rb +55 -54
  39. data/lib/right_scraper/scanners/workflow_metadata.rb +41 -39
  40. data/lib/right_scraper/scanners/workflow_s3_upload.rb +59 -55
  41. data/lib/right_scraper/scrapers.rb +32 -0
  42. data/lib/right_scraper/scrapers/base.rb +217 -205
  43. data/lib/right_scraper/scrapers/cookbook.rb +42 -40
  44. data/lib/right_scraper/scrapers/workflow.rb +57 -58
  45. data/lib/right_scraper/version.rb +3 -0
  46. data/right_scraper.gemspec +12 -16
  47. metadata +57 -163
  48. data/Gemfile +0 -15
  49. data/Rakefile +0 -89
  50. data/lib/right_scraper/logger.rb +0 -107
  51. data/lib/right_scraper/loggers/noisy.rb +0 -85
  52. data/lib/right_scraper/repositories/mock.rb +0 -70
  53. data/lib/right_scraper/retrievers/checkout.rb +0 -79
  54. data/lib/right_scraper/scraper_logger.rb +0 -66
  55. data/lib/right_scraper/svn_client.rb +0 -164
  56. data/right_scraper.rconf +0 -13
  57. data/spec/builder_spec.rb +0 -50
  58. data/spec/cookbook_helper.rb +0 -73
  59. data/spec/cookbook_manifest_spec.rb +0 -93
  60. data/spec/cookbook_s3_upload_spec.rb +0 -159
  61. data/spec/download/download_retriever_spec.rb +0 -118
  62. data/spec/download/download_retriever_spec_helper.rb +0 -72
  63. data/spec/download/download_spec.rb +0 -128
  64. data/spec/download/multi_dir_spec.rb +0 -106
  65. data/spec/download/multi_dir_spec_helper.rb +0 -40
  66. data/spec/git/cookbook_spec.rb +0 -165
  67. data/spec/git/demokey +0 -27
  68. data/spec/git/demokey.pub +0 -1
  69. data/spec/git/password_key +0 -30
  70. data/spec/git/password_key.pub +0 -1
  71. data/spec/git/repository_spec.rb +0 -110
  72. data/spec/git/retriever_spec.rb +0 -553
  73. data/spec/git/retriever_spec_helper.rb +0 -112
  74. data/spec/git/scraper_spec.rb +0 -151
  75. data/spec/git/ssh_spec.rb +0 -174
  76. data/spec/git/url_spec.rb +0 -103
  77. data/spec/logger_spec.rb +0 -185
  78. data/spec/repository_spec.rb +0 -111
  79. data/spec/retriever_spec_helper.rb +0 -146
  80. data/spec/scanner_spec.rb +0 -61
  81. data/spec/scraper_helper.rb +0 -88
  82. data/spec/scraper_spec.rb +0 -147
  83. data/spec/spec_helper.rb +0 -185
  84. data/spec/svn/cookbook_spec.rb +0 -96
  85. data/spec/svn/multi_svn_spec.rb +0 -64
  86. data/spec/svn/multi_svn_spec_helper.rb +0 -40
  87. data/spec/svn/repository_spec.rb +0 -72
  88. data/spec/svn/retriever_spec.rb +0 -266
  89. data/spec/svn/scraper_spec.rb +0 -90
  90. data/spec/svn/svn_retriever_spec_helper.rb +0 -90
  91. data/spec/svn/url_spec.rb +0 -47
  92. data/spec/url_spec.rb +0 -164
@@ -0,0 +1,178 @@
1
+ #--
2
+ # Copyright: Copyright (c) 2010-2013 RightScale, Inc.
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # 'Software'), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ # ancestor
25
+ require 'right_scraper/retrievers'
26
+
27
+ require 'fileutils'
28
+
29
+ module RightScraper::Retrievers
30
+
31
+ # Base class for retrievers that want to do version control operations
32
+ # (CVS, SVN, etc.). Subclasses can get away with implementing only
33
+ # Retrievers::Base#available? and #do_checkout but to support incremental
34
+ # operation need to implement #exists? and #do_update, in addition to
35
+ # Retrievers::Base#ignorable_paths.
36
+ class CheckoutBase < ::RightScraper::Retrievers::Base
37
+
38
+ # Attempts to update and then resorts to clean checkout for repository.
39
+ def retrieve
40
+ raise RetrieverError.new("retriever is unavailable") unless available?
41
+ updated = false
42
+ explanation = ''
43
+ if exists?
44
+ @logger.operation(:updating) do
45
+ # a retriever may be able to determine that the repo directory is
46
+ # already pointing to the same commit as the revision. in that case
47
+ # we can return quickly.
48
+ if remote_differs?
49
+ # there is no point in updating and failing the size check when the
50
+ # directory on disk already exceeds size limit; fall back to a clean
51
+ # checkout in hopes that the latest revision corrects the issue.
52
+ if size_limit_exceeded?
53
+ explanation = 'switching to checkout due to existing directory exceeding size limimt'
54
+ else
55
+ # attempt update.
56
+ begin
57
+ do_update
58
+ updated = true
59
+ rescue ::RightScraper::Processes::Shell::LimitError
60
+ # update exceeded a limitation; requires user intervention
61
+ raise
62
+ rescue Exception => e
63
+ # retry with clean checkout after discarding repo dir.
64
+ explanation = 'switching to checkout after unsuccessful update'
65
+ end
66
+ end
67
+ else
68
+ # no retrieval needed but warn exactly why we didn't do full
69
+ # checkout to avoid being challenged about it.
70
+ repo_ref = @repository.tag
71
+ do_update_tag
72
+ full_head_ref = @repository.tag
73
+ abbreviated_head_ref = full_head_ref[0..6]
74
+ if repo_ref == full_head_ref || repo_ref == abbreviated_head_ref
75
+ detail = abbreviated_head_ref
76
+ else
77
+ detail = "#{repo_ref} = #{abbreviated_head_ref}"
78
+ end
79
+ message =
80
+ "Skipped updating local directory due to the HEAD commit SHA " +
81
+ "on local matching the remote repository reference (#{detail})."
82
+ @logger.note_warning(message)
83
+ return false
84
+ end
85
+ end
86
+ end
87
+
88
+ # Clean checkout only if not updated.
89
+ unless updated
90
+ @logger.operation(:checkout, explanation) do
91
+ # remove any full or partial directory before attempting a clean
92
+ # checkout in case repo_dir is in a bad state.
93
+ if exists?
94
+ ::FileUtils.remove_entry_secure(@repo_dir)
95
+ end
96
+ ::FileUtils.mkdir_p(@repo_dir)
97
+ begin
98
+ do_checkout
99
+ rescue Exception
100
+ # clean checkout failed; repo directory is in an undetermined
101
+ # state and must be deleted to prevent a future update attempt.
102
+ if exists?
103
+ ::FileUtils.remove_entry_secure(@repo_dir) rescue nil
104
+ end
105
+ raise
106
+ end
107
+ end
108
+ end
109
+ true
110
+ end
111
+
112
+ # Return true if a checkout exists.
113
+ #
114
+ # === Returns
115
+ # Boolean:: true if the checkout already exists (and thus
116
+ # incremental updating can occur).
117
+ def exists?
118
+ false
119
+ end
120
+
121
+ # Determines if the remote SHA/tag/branch referenced by the repository
122
+ # differs from what appears on disk, if possible. Not all retrievers will
123
+ # have this capability. If not, the retriever should default to returning
124
+ # true to indicate that the remote is changed.
125
+ #
126
+ # @return [TrueClass|FalseClass] true if changed
127
+ def remote_differs?
128
+ true
129
+ end
130
+
131
+ # Determines if total size of files in repo_dir has exceeded size limit.
132
+ #
133
+ # === Return
134
+ # @return [TrueClass|FalseClass] true if size limit exceeded
135
+ def size_limit_exceeded?
136
+ if @max_bytes
137
+ # note that Dir.glob ignores hidden directories (e.g. ".git") so the
138
+ # size total correctly excludes those hidden contents that are not to
139
+ # be uploaded after scrape. this may cause the on-disk directory size
140
+ # to far exceed the upload size.
141
+ globbie = ::File.join(@repo_dir, '**/*')
142
+ size = 0
143
+ ::Dir.glob(globbie) do |f|
144
+ size += ::File.stat(f).size rescue 0 if ::File.file?(f)
145
+ break if size > @max_bytes
146
+ end
147
+ size > @max_bytes
148
+ else
149
+ false
150
+ end
151
+ end
152
+
153
+ # Perform a de novo full checkout of the repository. Subclasses
154
+ # must override this to do anything useful.
155
+ #
156
+ # @return [TrueClass] always true
157
+ def do_checkout
158
+ raise NotImplementedError
159
+ end
160
+
161
+ # Perform an incremental update of the checkout. Subclasses that
162
+ # want to handle incremental updating need to override this.
163
+ #
164
+ # @return [TrueClass] always true
165
+ def do_update
166
+ raise NotImplementedError
167
+ end
168
+
169
+ # Updates the tag of the repository associated with this retriever to refer
170
+ # to the HEAD commit (SHA) on disk after retrieval.
171
+ #
172
+ # @return [TrueClass] always true
173
+ def do_update_tag
174
+ raise NotImplementedError
175
+ end
176
+
177
+ end
178
+ end
@@ -1,5 +1,5 @@
1
1
  #--
2
- # Copyright: Copyright (c) 2010-2011 RightScale, Inc.
2
+ # Copyright: Copyright (c) 2010-2013 RightScale, Inc.
3
3
  #
4
4
  # Permission is hereby granted, free of charge, to any person obtaining
5
5
  # a copy of this software and associated documentation files (the
@@ -21,68 +21,110 @@
21
21
  # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
 
24
+ # ancestor
25
+ require 'right_scraper/retrievers'
26
+
27
+ require 'fileutils'
24
28
  require 'tempfile'
25
29
  require 'digest/sha1'
26
30
  require 'right_popen'
27
31
  require 'right_popen/safe_output_buffer'
28
32
 
29
- module RightScraper
30
- module Retrievers
31
- # A retriever for resources stored in archives on a web server
32
- # somewhere. Uses command line curl and command line tar.
33
- class Download < Base
33
+ module RightScraper::Retrievers
34
34
 
35
- class DownloadError < Exception; end
35
+ # A retriever for resources stored in archives on a web server
36
+ # somewhere. Uses command line curl and command line tar.
37
+ class Download < ::RightScraper::Retrievers::Base
36
38
 
37
- @@available = false
39
+ class DownloadError < Exception; end
38
40
 
39
- # Determines if downloader is available.
40
- def available?
41
- unless @@available
42
- begin
43
- # FIX: we might want to parse the result and require a minimum curl
44
- # version.
45
- cmd = "curl --version"
46
- `#{cmd}`
47
- if $?.success?
48
- @@available = true
49
- else
50
- raise RetrieverError, "\"#{cmd}\" exited with #{$?.exitstatus}"
51
- end
52
- rescue
53
- @logger.note_error($!, :available, "download retriever is unavailable")
41
+ @@available = false
42
+
43
+ # Determines if downloader is available.
44
+ def available?
45
+ unless @@available
46
+ begin
47
+ # FIX: we might want to parse the result and require a minimum curl
48
+ # version.
49
+ cmd = "curl --version"
50
+ `#{cmd}`
51
+ if $?.success?
52
+ @@available = true
53
+ else
54
+ raise RetrieverError, "\"#{cmd}\" exited with #{$?.exitstatus}"
54
55
  end
56
+ rescue
57
+ @logger.note_error($!, :available, "download retriever is unavailable")
55
58
  end
56
- @@available
57
59
  end
60
+ @@available
61
+ end
62
+
63
+ # Directory used to download tarballs
64
+ def workdir
65
+ @workdir ||= ::File.join(::File.dirname(@repo_dir), 'download')
66
+ end
58
67
 
59
- # Directory used to download tarballs
60
- def workdir
61
- @workdir ||= ::File.join(::File.dirname(@repo_dir), 'download')
68
+ # Download tarball and unpack it
69
+ def retrieve
70
+ raise RetrieverError.new("download retriever is unavailable") unless available?
71
+ ::FileUtils.remove_entry_secure @repo_dir if File.exists?(@repo_dir)
72
+ ::FileUtils.remove_entry_secure workdir if File.exists?(workdir)
73
+ ::FileUtils.mkdir_p @repo_dir
74
+ ::FileUtils.mkdir_p workdir
75
+ file = ::File.join(workdir, "package")
76
+
77
+ # TEAL FIX: we have to always-download the tarball before we can
78
+ # determine if contents have changed, but afterward we can compare the
79
+ # previous download against the latest downloaded and short-circuit the
80
+ # remaining flow for the no-difference case.
81
+ @logger.operation(:downloading) do
82
+ credential_command = if @repository.first_credential && @repository.second_credential
83
+ ['-u', "#{@repository.first_credential}:#{@repository.second_credential}"]
84
+ else
85
+ []
86
+ end
87
+ @output = ::RightScale::RightPopen::SafeOutputBuffer.new
88
+ @cmd = [
89
+ 'curl',
90
+ '--silent', '--show-error', '--location', '--fail',
91
+ '--location-trusted', '-o', file, credential_command,
92
+ @repository.url
93
+ ].flatten
94
+ begin
95
+ ::RightScale::RightPopen.popen3_sync(
96
+ @cmd,
97
+ :target => self,
98
+ :pid_handler => :pid_download,
99
+ :timeout_handler => :timeout_download,
100
+ :size_limit_handler => :size_limit_download,
101
+ :exit_handler => :exit_download,
102
+ :stderr_handler => :output_download,
103
+ :stdout_handler => :output_download,
104
+ :inherit_io => true, # avoid killing any rails connection
105
+ :watch_directory => workdir,
106
+ :size_limit_bytes => @max_bytes,
107
+ :timeout_seconds => @max_seconds)
108
+ rescue Exception => e
109
+ @logger.note_phase(:abort, :running_command, 'curl', e)
110
+ raise
111
+ end
62
112
  end
63
113
 
64
- # Download tarball and unpack it
65
- def retrieve
66
- raise RetrieverError.new("download retriever is unavailable") unless available?
67
- FileUtils.remove_entry_secure @repo_dir if File.exists?(@repo_dir)
68
- FileUtils.remove_entry_secure workdir if File.exists?(workdir)
69
- FileUtils.mkdir_p @repo_dir
70
- FileUtils.mkdir_p workdir
71
- file = File.join(workdir, "package")
72
-
73
- @logger.operation(:downloading) do
74
- credential_command = if @repository.first_credential && @repository.second_credential
75
- ['-u', "#{@repository.first_credential}:#{@repository.second_credential}"]
76
- else
77
- []
78
- end
114
+ note_tag(file)
115
+
116
+ @logger.operation(:unpacking) do
117
+ path = @repository.to_url.path
118
+ if path =~ /\.gz$/
119
+ extraction = "xzf"
120
+ elsif path =~ /\.bz2$/
121
+ extraction = "xjf"
122
+ else
123
+ extraction = "xf"
124
+ end
125
+ Dir.chdir(@repo_dir) do
79
126
  @output = ::RightScale::RightPopen::SafeOutputBuffer.new
80
- @cmd = [
81
- 'curl',
82
- '--silent', '--show-error', '--location', '--fail',
83
- '--location-trusted', '-o', file, credential_command,
84
- @repository.url
85
- ].flatten
127
+ @cmd = ['tar', extraction, file]
86
128
  begin
87
129
  ::RightScale::RightPopen.popen3_sync(
88
130
  @cmd,
@@ -94,90 +136,56 @@ module RightScraper
94
136
  :stderr_handler => :output_download,
95
137
  :stdout_handler => :output_download,
96
138
  :inherit_io => true, # avoid killing any rails connection
97
- :watch_directory => workdir,
139
+ :watch_directory => @repo_dir,
98
140
  :size_limit_bytes => @max_bytes,
99
141
  :timeout_seconds => @max_seconds)
100
142
  rescue Exception => e
101
- @logger.note_phase(:abort, :running_command, 'curl', e)
143
+ @logger.note_phase(:abort, :running_command, @cmd.first, e)
102
144
  raise
103
145
  end
104
146
  end
105
-
106
- note_tag(file)
107
-
108
- @logger.operation(:unpacking) do
109
- path = @repository.to_url.path
110
- if path =~ /\.gz$/
111
- extraction = "xzf"
112
- elsif path =~ /\.bz2$/
113
- extraction = "xjf"
114
- else
115
- extraction = "xf"
116
- end
117
- Dir.chdir(@repo_dir) do
118
- @output = ::RightScale::RightPopen::SafeOutputBuffer.new
119
- @cmd = ['tar', extraction, file]
120
- begin
121
- ::RightScale::RightPopen.popen3_sync(
122
- @cmd,
123
- :target => self,
124
- :pid_handler => :pid_download,
125
- :timeout_handler => :timeout_download,
126
- :size_limit_handler => :size_limit_download,
127
- :exit_handler => :exit_download,
128
- :stderr_handler => :output_download,
129
- :stdout_handler => :output_download,
130
- :inherit_io => true, # avoid killing any rails connection
131
- :watch_directory => @repo_dir,
132
- :size_limit_bytes => @max_bytes,
133
- :timeout_seconds => @max_seconds)
134
- rescue Exception => e
135
- @logger.note_phase(:abort, :running_command, @cmd.first, e)
136
- raise
137
- end
138
- end
139
- end
140
147
  end
148
+ true
149
+ end
141
150
 
142
- def pid_download(pid)
143
- @logger.note_phase(:begin, :running_command, @cmd.first)
144
- true
145
- end
151
+ def pid_download(pid)
152
+ @logger.note_phase(:begin, :running_command, @cmd.first)
153
+ true
154
+ end
146
155
 
147
- def output_download(data)
148
- @output.safe_buffer_data(data)
149
- end
156
+ def output_download(data)
157
+ @output.safe_buffer_data(data)
158
+ end
150
159
 
151
- def timeout_download
152
- raise DownloadError, "Downloader timed out"
153
- end
160
+ def timeout_download
161
+ raise DownloadError, "Downloader timed out"
162
+ end
154
163
 
155
- def size_limit_download
156
- raise DownloadError, "Downloader exceeded size limit"
157
- end
164
+ def size_limit_download
165
+ raise DownloadError, "Downloader exceeded size limit"
166
+ end
158
167
 
159
- def exit_download(status)
160
- unless status.success?
161
- @output.safe_buffer_data("Exit code = #{status.exitstatus}")
162
- raise DownloadError, "Downloader failed: #{@output.display_text}"
163
- end
164
- @logger.note_phase(:commit, :running_command, @cmd.first)
165
- true
168
+ def exit_download(status)
169
+ unless status.success?
170
+ @output.safe_buffer_data("Exit code = #{status.exitstatus}")
171
+ raise DownloadError, "Downloader failed: #{@output.display_text}"
166
172
  end
173
+ @logger.note_phase(:commit, :running_command, @cmd.first)
174
+ true
175
+ end
167
176
 
168
177
 
169
- # Amend @repository with the tag information from the downloaded
170
- # file.
171
- #
172
- # === Parameters
173
- # file(String):: file that was downloaded
174
- def note_tag(file)
175
- digest = Digest::SHA1.new
176
- File.open(file) {|f| digest << f.read(4096) }
177
- repo = @repository.clone
178
- repo.tag = digest.hexdigest
179
- @repository = repo
180
- end
178
+ # Amend @repository with the tag information from the downloaded
179
+ # file.
180
+ #
181
+ # === Parameters
182
+ # file(String):: file that was downloaded
183
+ def note_tag(file)
184
+ digest = Digest::SHA1.new
185
+ File.open(file) {|f| digest << f.read(4096) }
186
+ repo = @repository.clone
187
+ repo.tag = digest.hexdigest
188
+ @repository = repo
181
189
  end
182
190
  end
183
191
  end