powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
package/snapcrawl/lib/snapcrawl/cli.rb
@@ -0,0 +1,52 @@
+ require 'colsole'
+ require 'docopt'
+ require 'fileutils'
+
+ module Snapcrawl
+   class CLI
+     include Colsole
+     using StringRefinements
+     using PairSplit
+
+     def call(args = [])
+       execute Docopt.docopt(docopt, version: VERSION, argv: args)
+     rescue Docopt::Exit => e
+       puts e.message
+     end
+
+     private
+
+     def execute(args)
+       config_file = args['--config']
+       Config.load config_file if config_file
+
+       tweaks = args['SETTINGS'].pair_split
+       apply_tweaks tweaks if tweaks
+
+       Dependencies.verify
+
+       $logger.debug 'initializing cli'
+       FileUtils.mkdir_p Config.snaps_dir
+
+       url = args['URL'].protocolize
+       crawler = Crawler.new url
+
+       crawler.crawl
+     end
+
+     def docopt
+       @docopt ||= File.read docopt_path
+     end
+
+     def docopt_path
+       File.expand_path 'templates/docopt.txt', __dir__
+     end
+
+     def apply_tweaks(tweaks)
+       tweaks.each do |key, value|
+         Config.settings[key] = value
+         $logger.level = value if key == 'log_level'
+       end
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/config.rb
@@ -0,0 +1,60 @@
+ require 'sting'
+ require 'fileutils'
+
+ module Snapcrawl
+   class Config < Sting
+     class << self
+       def load(file = nil)
+         reset!
+         push defaults
+
+         return unless file
+
+         file = "#{file}.yml" unless /\.ya?ml$/.match?(file)
+
+         # FIXME: Cannot use logger here due to the "chicken and egg" with
+         # Config. The $logger is available, but it was not yet fully
+         # configured with log_level etc.
+         if File.exist? file
+           # $logger.debug "loading config file g`#{file}`"
+           push file
+         else
+           # $logger.debug "creating config file g`#{file}`"
+           create_config file
+         end
+       end
+
+       private
+
+       def defaults
+         {
+           depth: 1,
+           width: 1280,
+           height: 0,
+           cache_life: 86_400,
+           cache_dir: 'cache',
+           snaps_dir: 'snaps',
+           name_template: '%{url}',
+           url_whitelist: nil,
+           url_blacklist: nil,
+           css_selector: nil,
+           log_level: 1,
+           log_color: 'auto',
+           skip_ssl_verification: false,
+           screenshot_delay: nil,
+         }
+       end
+
+       def create_config(file)
+         content = File.read config_template
+         dir = File.dirname file
+         FileUtils.mkdir_p dir
+         File.write file, content
+       end
+
+       def config_template
+         File.expand_path 'templates/config.yml', __dir__
+       end
+     end
+   end
+ end
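
A quick sketch (editor's illustration, not part of the diff) of how this Sting-backed Config is used, based only on the calls made elsewhere in this changeset (Config.load, Config.depth, Config.settings):

    require 'snapcrawl'

    # merge ./snapcrawl.yml on top of the defaults; per create_config above,
    # the file is created from the bundled template if it does not exist yet
    Snapcrawl::Config.load 'snapcrawl'

    Snapcrawl::Config.depth                   # one reader per setting (default: 1)
    Snapcrawl::Config.settings['depth'] = 2   # how the CLI applies key=value tweaks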
package/snapcrawl/lib/snapcrawl/crawler.rb
@@ -0,0 +1,98 @@
+ require 'fileutils'
+
+ module Snapcrawl
+   class Crawler
+     using StringRefinements
+
+     attr_reader :url
+
+     def initialize(url)
+       $logger.debug "initializing crawler with g`#{url}`"
+
+       config_for_display = Config.settings.dup
+       config_for_display['name_template'] = '%%{url}'
+
+       $logger.debug "config #{config_for_display}"
+       @url = url
+     end
+
+     def crawl
+       Dependencies.verify
+       todo[url] = Page.new url
+       process_todo while todo.any?
+     end
+
+     private
+
+     def process_todo
+       $logger.debug "processing queue: g`#{todo.count} remaining`"
+
+       url, page = todo.shift
+       done.push url
+
+       return unless process_page page
+
+       register_sub_pages page.pages if page.depth < Config.depth
+     end
+
+     def register_sub_pages(pages)
+       pages.each do |sub_page|
+         next if todo.has_key?(sub_page) || done.include?(sub_page)
+
+         if Config.url_whitelist && sub_page.path !~ (/#{Config.url_whitelist}/)
+           $logger.debug "ignoring mu`#{sub_page.url}`, reason: whitelist"
+           next
+         end
+
+         if Config.url_blacklist && sub_page.path =~ (/#{Config.url_blacklist}/)
+           $logger.debug "ignoring mu`#{sub_page.url}`, reason: blacklist"
+           next
+         end
+
+         todo[sub_page.url] = sub_page
+       end
+     end
+
+     def process_page(page)
+       outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }
+
+       $logger.info "processing mu`#{page.url}`, depth: #{page.depth}"
+
+       unless page.valid?
+         $logger.debug "page #{page.path} is invalid, aborting process"
+         return false
+       end
+
+       if file_fresh? outfile
+         $logger.info "screenshot for #{page.path} already exists"
+       else
+         $logger.info "gb`capturing screenshot for #{page.path}`"
+         save_screenshot page, outfile
+       end
+
+       true
+     end
+
+     def save_screenshot(page, outfile)
+       page.save_screenshot outfile
+     rescue => e
+       $logger.error "screenshot error on mu`#{page.path}` - r`#{e.class}`: #{e.message}"
+     end
+
+     def file_fresh?(file)
+       Config.cache_life.positive? and File.exist?(file) and file_age(file) < Config.cache_life
+     end
+
+     def file_age(file)
+       (Time.now - File.stat(file).mtime).to_i
+     end
+
+     def todo
+       @todo ||= {}
+     end
+
+     def done
+       @done ||= []
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/dependencies.rb
@@ -0,0 +1,21 @@
+ require 'colsole'
+
+ module Snapcrawl
+   class Dependencies
+     class << self
+       include Colsole
+
+       def verify
+         return if @verified
+
+         $logger.debug 'verifying g`phantomjs` is present'
+         raise MissingPhantomJS unless command_exist? 'phantomjs'
+
+         $logger.debug 'verifying g`imagemagick` is present'
+         raise MissingImageMagick unless command_exist? 'convert'
+
+         @verified = true
+       end
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/exceptions.rb
@@ -0,0 +1,5 @@
+ module Snapcrawl
+   class MissingPhantomJS < StandardError; end
+   class MissingImageMagick < StandardError; end
+   class ScreenshotError < StandardError; end
+ end
package/snapcrawl/lib/snapcrawl/log_helpers.rb
@@ -0,0 +1,36 @@
+ require 'colsole'
+
+ module Snapcrawl
+   module LogHelpers
+     include Colsole
+
+     SEVERITY_COLORS = {
+       'INFO' => :b,
+       'WARN' => :y,
+       'ERROR' => :r,
+       'FATAL' => :r,
+       'DEBUG' => :c,
+     }
+
+     def log_formatter
+       proc do |severity, _time, _prog, message|
+         severity_color = SEVERITY_COLORS[severity]
+         line = "#{severity_color}`#{severity.rjust 5}` : #{message}\n"
+         use_colors? ? colorize(line) : strip_colors(line)
+       end
+     end
+
+     def use_colors?
+       @use_colors ||= (Config.log_color == 'auto' ? tty? : Config.log_color)
+     end
+
+     def tty?
+       case ENV['TTY']
+       when 'on' then true
+       when 'off' then false
+       else
+         $stdout.tty?
+       end
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/page.rb
@@ -0,0 +1,118 @@
+ require 'addressable/uri'
+ require 'fileutils'
+ require 'httparty'
+ require 'lightly'
+ require 'nokogiri'
+
+ module Snapcrawl
+   class Page
+     using StringRefinements
+
+     attr_reader :url, :depth
+
+     EXTENSION_BLACKLIST = 'png|gif|jpg|pdf|zip'
+     PROTOCOL_BLACKLIST = 'mailto|tel'
+
+     def initialize(url, depth: 0)
+       @url = url.protocolize
+       @depth = depth
+     end
+
+     def valid?
+       http_response&.success?
+     end
+
+     def site
+       @site ||= Addressable::URI.parse(url).site
+     end
+
+     def path
+       @path ||= Addressable::URI.parse(url).request_uri
+     end
+
+     def links
+       return nil unless valid?
+
+       doc = Nokogiri::HTML http_response.body
+       normalize_links doc.css('a')
+     end
+
+     def pages
+       return nil unless valid?
+
+       links.map { |link| Page.new link, depth: depth + 1 }
+     end
+
+     def save_screenshot(outfile)
+       return false unless valid?
+
+       Screenshot.new(url).save outfile
+     end
+
+     private
+
+     def http_response
+       @http_response ||= http_response!
+     end
+
+     def http_response!
+       response = cache.get(url) { HTTParty.get url, httparty_options }
+
+       unless response.success?
+         $logger.warn "http error on mu`#{url}`, code: y`#{response.code}`, message: #{response.message.strip}"
+       end
+
+       response
+     rescue => e
+       $logger.error "http error on mu`#{url}` - r`#{e.class}`: #{e.message}"
+       nil
+     end
+
+     def httparty_options
+       Config.skip_ssl_verification ? { verify: false } : {}
+     end
+
+     def normalize_links(links)
+       result = []
+
+       links.each do |link|
+         valid_link = normalize_link link
+         result << valid_link if valid_link
+       end
+
+       result.uniq
+     end
+
+     def normalize_link(link)
+       link = link.attribute('href').to_s.dup
+
+       # Remove #hash
+       link.gsub!(/#.+$/, '')
+       return nil if link.empty?
+
+       # Remove links to specific extensions and protocols
+       return nil if /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/o.match?(link)
+       return nil if /^(#{PROTOCOL_BLACKLIST}):/o.match?(link)
+
+       # Strip spaces
+       link.strip!
+
+       # Convert relative links to absolute
+       begin
+         link = Addressable::URI.join(url, link).to_s.dup
+       rescue => e
+         $logger.warn "r`#{e.class}`: #{e.message} on #{path} (link: #{link})"
+         return nil
+       end
+
+       # Keep only links in our base domain
+       return nil unless link.include? site
+
+       link
+     end
+
+     def cache
+       Lightly.new life: Config.cache_life
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/pretty_logger.rb
@@ -0,0 +1,11 @@
+ require 'logger'
+
+ module Snapcrawl
+   class PrettyLogger
+     extend LogHelpers
+
+     def self.new
+       Logger.new($stdout, formatter: log_formatter, level: Config.log_level)
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb
@@ -0,0 +1,26 @@
+ module Snapcrawl
+   module PairSplit
+     refine Array do
+       def pair_split
+         false_values = %w[no false]
+         true_values = %w[yes true]
+
+         to_h do |pair|
+           key, value = pair.split '='
+
+           value = if /^\d+$/.match?(value)
+             value.to_i
+           elsif false_values.include? value
+             false
+           elsif true_values.include? value
+             true
+           else
+             value
+           end
+
+           [key, value]
+         end
+       end
+     end
+   end
+ end
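
An illustrative sketch (not part of the diff) of this refinement in action. The input mirrors the SETTINGS arguments the CLI collects: digit-only values become Integers, yes/true and no/false become booleans, and everything else stays a String:

    require 'snapcrawl/refinements/pair_split'

    using Snapcrawl::PairSplit

    ['depth=2', 'log_color=no', 'cache_dir=tmp'].pair_split
    #=> { "depth" => 2, "log_color" => false, "cache_dir" => "tmp" }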
package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb
@@ -0,0 +1,13 @@
+ module Snapcrawl
+   module StringRefinements
+     refine String do
+       def to_slug
+         downcase.gsub(/[^a-z0-9]+/, '-')
+       end
+
+       def protocolize
+         /^http/.match?(self) ? self : "http://#{self}"
+       end
+     end
+   end
+ end
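
For illustration (again, not part of the diff), the two refined String methods behave like this:

    require 'snapcrawl/refinements/string_refinements'

    using Snapcrawl::StringRefinements

    'example.com/about'.protocolize     #=> "http://example.com/about"
    'https://example.com'.protocolize   #=> "https://example.com"
    'https://Example.com/a/b?x=1'.to_slug
    #=> "https-example-com-a-b-x-1"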
package/snapcrawl/lib/snapcrawl/screenshot.rb
@@ -0,0 +1,73 @@
+ require 'webshot'
+
+ module Snapcrawl
+   class Screenshot
+     using StringRefinements
+
+     attr_reader :url
+
+     def initialize(url)
+       @url = url
+     end
+
+     def save(outfile = nil)
+       outfile ||= "#{url.to_slug}.png"
+       webshot_capture url, outfile
+     end
+
+     private
+
+     def webshot_capture(url, image_path)
+       webshot_capture! url, image_path
+     rescue => e
+       raise ScreenshotError, "#{e.class} #{e.message}"
+     end
+
+     def webshot_capture!(url, image_path)
+       hide_output do
+         webshot.capture url, image_path, webshot_options do |magick|
+           magick.combine_options do |c|
+             c.background 'white'
+             c.gravity 'north'
+             c.quality 100
+             c.extent Config.height.positive? ? "#{Config.width}x#{Config.height}" : "#{Config.width}x"
+           end
+         end
+       end
+     end
+
+     def webshot_options
+       result = { allowed_status_codes: [404, 401, 403] }
+
+       if Config.css_selector
+         result[:selector] = Config.css_selector
+         result[:full] = false
+       end
+
+       if Config.screenshot_delay
+         result[:timeout] = Config.screenshot_delay
+       end
+
+       result
+     end
+
+     def webshot
+       @webshot ||= Webshot::Screenshot.instance
+     end
+
+     # The webshot gem messes with stdout/stderr streams so we keep it in
+     # check by using this method. Also, in some sites (e.g. uown.co) it
+     # prints some output to stdout, this is why we override $stdout for
+     # the duration of the run.
+     def hide_output
+       keep_stdout = $stdout
+       keep_stderr = $stderr
+       $stdout = StringIO.new
+       $stderr = StringIO.new
+       yield
+     ensure
+       $stdout = keep_stdout
+       $stderr = keep_stderr
+     end
+   end
+ end
package/snapcrawl/lib/snapcrawl/templates/config.yml
@@ -0,0 +1,49 @@
+ # All values below are the default values
+
+ # log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
+ log_level: 1
+
+ # log_color (yes, no, auto)
+ #   yes  = always show log color
+ #   no   = never use colors
+ #   auto = only use colors when running in an interactive terminal
+ log_color: auto
+
+ # number of levels to crawl, 0 means capture only the root URL
+ depth: 1
+
+ # screenshot width in pixels
+ width: 1280
+
+ # screenshot height in pixels, 0 means the entire height
+ height: 0
+
+ # number of seconds to consider the page cache and its screenshot fresh
+ cache_life: 86400
+
+ # where to store the HTML page cache
+ cache_dir: cache
+
+ # where to store screenshots
+ snaps_dir: snaps
+
+ # screenshot filename template, where '%{url}' will be replaced with a
+ # slug version of the URL (no need to include the .png extension)
+ name_template: '%{url}'
+
+ # urls not matching this regular expression will be ignored
+ url_whitelist:
+
+ # urls matching this regular expression will be ignored
+ url_blacklist:
+
+ # take a screenshot of this CSS selector only
+ css_selector:
+
+ # when true, ignore SSL related errors
+ skip_ssl_verification: false
+
+ # set to any number of seconds to wait for the page to load before taking
+ # a screenshot, leave empty to not wait at all (only needed for pages with
+ # animations or other post-load events).
+ screenshot_delay:
package/snapcrawl/lib/snapcrawl/templates/docopt.txt
@@ -0,0 +1,26 @@
+ Snapcrawl
+
+ Usage:
+   snapcrawl URL [--config FILE] [SETTINGS...]
+   snapcrawl -h | --help
+   snapcrawl -v | --version
+
+ Options:
+   -c, --config FILE
+     Path to config file, with or without the .yml extension.
+     A sample file will be created if not found.
+     The default filename is 'snapcrawl.yml'.
+
+   -h, --help
+     Show this screen
+
+   -v, --version
+     Show version number
+
+ Settings:
+   Provide any of the options available in the config as 'key=value'.
+
+ Examples:
+   snapcrawl example.com
+   snapcrawl example.com --config simple
+   snapcrawl example.com depth=1 log_level=2 width=768
package/snapcrawl/lib/snapcrawl/version.rb
@@ -0,0 +1,3 @@
+ module Snapcrawl
+   VERSION = '0.5.4'
+ end
package/snapcrawl/lib/snapcrawl.rb
@@ -0,0 +1,20 @@
+ require 'snapcrawl/version'
+ require 'snapcrawl/exceptions'
+ require 'snapcrawl/refinements/pair_split'
+ require 'snapcrawl/refinements/string_refinements'
+ require 'snapcrawl/log_helpers'
+ require 'snapcrawl/pretty_logger'
+ require 'snapcrawl/dependencies'
+ require 'snapcrawl/config'
+ require 'snapcrawl/screenshot'
+ require 'snapcrawl/page'
+ require 'snapcrawl/crawler'
+ require 'snapcrawl/cli'
+
+ if ENV['BYEBUG']
+   require 'byebug'
+   require 'lp'
+ end
+
+ Snapcrawl::Config.load
+ $logger = Snapcrawl::PrettyLogger.new
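
Note that requiring the gem eagerly loads a default Config and assigns the global $logger, so library-style use reduces to a few lines (a hedged sketch, assuming PhantomJS and ImageMagick are installed, as checked by Dependencies.verify):

    require 'snapcrawl'

    Snapcrawl::Config.settings['depth'] = 0   # capture only the root URL
    Snapcrawl::Crawler.new('https://example.com').crawl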
package/snapcrawl/snapcrawl.gemspec
@@ -0,0 +1,27 @@
+ lib = File.expand_path('lib', __dir__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'snapcrawl/version'
+
+ Gem::Specification.new do |s|
+   s.name = 'snapcrawl'
+   s.version = Snapcrawl::VERSION
+   s.summary = 'Crawl a website and take screenshots (CLI + Library)'
+   s.description = 'Snapcrawl is a command line utility for crawling a website and saving screenshots.'
+   s.authors = ['Danny Ben Shitrit']
+   s.email = 'db@dannyben.com'
+   s.files = Dir['README.md', 'lib/**/*']
+   s.executables = ['snapcrawl']
+   s.homepage = 'https://github.com/DannyBen/snapcrawl'
+   s.license = 'MIT'
+   s.required_ruby_version = '>= 3.0'
+
+   s.add_runtime_dependency 'addressable', '~> 2.7'
+   s.add_runtime_dependency 'colsole', '>= 0.8.1', '< 2'
+   s.add_runtime_dependency 'docopt', '~> 0.6'
+   s.add_runtime_dependency 'httparty', '~> 0.21'
+   s.add_runtime_dependency 'lightly', '~> 0.3'
+   s.add_runtime_dependency 'nokogiri', '~> 1.10'
+   s.add_runtime_dependency 'sting', '~> 0.4'
+   s.add_runtime_dependency 'webshot', '~> 0.1'
+   s.metadata['rubygems_mfa_required'] = 'true'
+ end
package/snapcrawl/snapcrawl.yml
@@ -0,0 +1,41 @@
+ # All values below are the default values
+
+ # log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
+ log_level: 1
+
+ # log_color (yes, no, auto)
+ #   yes  = always show log color
+ #   no   = never use colors
+ #   auto = only use colors when running in an interactive terminal
+ log_color: auto
+
+ # number of levels to crawl, 0 means capture only the root URL
+ depth: 1
+
+ # screenshot width in pixels
+ width: 1280
+
+ # screenshot height in pixels, 0 means the entire height
+ height: 0
+
+ # number of seconds to consider the page cache and its screenshot fresh
+ cache_life: 86400
+
+ # where to store the HTML page cache
+ cache_dir: cache
+
+ # where to store screenshots
+ snaps_dir: snaps
+
+ # screenshot filename template, where '%{url}' will be replaced with a
+ # slug version of the URL (no need to include the .png extension)
+ name_template: '%{url}'
+
+ # urls not matching this regular expression will be ignored
+ url_whitelist:
+
+ # urls matching this regular expression will be ignored
+ url_blacklist:
+
+ # take a screenshot of this CSS selector only
+ css_selector: