powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
@@ -0,0 +1,166 @@
1
+ RubyRetriever
2
+ ==============
3
+ [![Gem Version](https://badge.fury.io/rb/rubyretriever.svg)](http://badge.fury.io/rb/rubyretriever) [![Build Status](https://travis-ci.org/joenorton/rubyretriever.svg?branch=master)](https://travis-ci.org/joenorton/rubyretriever)
4
+
5
+ [RubyRetriever Webpage](https://norton.io/projects/rubyretriever/)
6
+
7
+ By Joe Norton
8
+
9
+ RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.
10
+
11
+ RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
12
+
13
+ **v1.4.3 Update (3/24/2016)** - Fixes a problem with file downloads that had query strings: the filename was being saved with the query string still attached. No more.
14
+
15
+ **v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
16
+
17
+ **v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
18
+
19
+ **v1.4.0 Update (3/24/2016)** - Several bug fixes.
20
+
21
+
22
+ Mission
23
+ -------
24
+ RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
25
+
26
+
27
+ Roadmap?
28
+ Not sure. Feel free to offer your thoughts.
29
+
30
+ Some Potential Ideas:
31
+ * 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
32
+ * 'dead-link finder' mode - collects links returning 404, or other error msgs
33
+ * 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
34
+ * more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.
35
+
36
+ Features
37
+ --------
38
+ * Asynchronous HTTP Requests thru EM & Synchrony
39
+ * Bloom filter for tracking visited pages
40
+ * Supports HTTPS
41
+ * Follows 301 redirects (if to same host)
42
+ * 3 CLI modes
43
+ * Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
44
+ * File Harvest - find all files linked to on a website, option to autodownload
45
+ * SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
46
+ * Run a Custom Block on a Per-Page basis (PageIterator)
47
+
48
+ Use cases
49
+ ---------
50
+ **As an Executable**
51
+ With a single command at the terminal, RR can:
52
+ 1. Crawl your website and output a *valid XML sitemap* based on what it found.
53
+ 2. Crawl a target website and *download all files of a given filetype*.
54
+ 3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
55
+
56
+ **Used in Custom scripts**
57
+ As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
58
+
59
+ Help & Forks Welcome!
60
+
61
+ Getting started
62
+ -----------
63
+ Install the gem
64
+ ```sh
65
+ $ gem install rubyretriever
66
+ ```
67
+
68
+
69
+ Using the Executable
70
+ --------------------
71
+ **Example: Sitemap mode**
72
+ ```sh
73
+ $ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
74
+ ```
75
+ OR -- SAME COMMAND
76
+ ```sh
77
+ $ rr -s csv -p -l 10 http://www.cnet.com
78
+ ```
79
+
80
+ This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
81
+
82
+ **Example: File Harvesting mode**
83
+ ```sh
84
+ $ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
85
+ ```
86
+ OR -- SAME COMMAND
87
+ ```sh
88
+ $ rr -f txt -v -l 1 http://textfiles.com/programming/
89
+ ```
90
+
91
+ This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
92
+
93
+ **Example: SEO mode**
94
+ ```sh
95
+ $ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
96
+ ```
97
+ OR -- SAME COMMAND
98
+ ```sh
99
+ $ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
100
+ ```
101
+
102
+ This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
103
+
104
+
105
+ command-line arguments
106
+ -----------------------
107
+ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
108
+
109
+ Where MODE FLAG is required, and is either:
110
+ -s, --sitemap FORMAT (only accepts CSV or XML atm)
111
+ -f, --files FILETYPE
112
+ -e, --seo
113
+
114
+ and OPTIONS is any applicable combination of:
115
+ -o, --out FILENAME *Dump fetch data as CSV*
116
+ -p, --progress *Outputs a progressbar*
117
+ -v, --verbose *Output more information*
118
+ -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
119
+ -h, --help *Display this screen*
120
+
121
+
122
+ Using as a Library (starting as of version 1.3.0)
123
+ ------------------
124
+
125
+ If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
126
+
127
+ Sample Script using **PageIterator**
128
+ ```ruby
129
+ require 'retriever'
130
+ opts = {
131
+ 'maxpages' => 1
132
+ }
133
+ t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
134
+ [page.url, page.title]
135
+ end
136
+ puts t.result.to_s
137
+ ```
138
+
139
+ ```sh
140
+ >> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
141
+ ```
142
+ Available methods on the page iterator:
143
+ * **#url** - returns full URL of current page
144
+ * **#source** - returns raw page source code
145
+ * **#title** - returns html decoded version of current page title
146
+ * **#desc** - returns html decoded version of current page meta description
147
+ * **#h1** - returns html decoded version of current page's h1 tag
148
+ * **#h2** - returns html decoded version of current page's h2 tag
149
+ * **#links** - returns array of all links on the page
150
+ * **#parse_internal** - returns array of current page's internal (same host) links
151
+ * **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
152
+ * **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
153
+ * **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
154
+
155
+
156
+ Current Requirements
157
+ ------------
158
+ em-synchrony
159
+ ruby-progressbar
160
+ bloomfilter-rb
161
+ addressable
162
+ htmlentities
163
+
164
+ License
165
+ -------
166
+ See included 'LICENSE' file. It's the MIT license.
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'retriever/version'
5
+
6
+ Gem::Specification.new do |s|
7
+ s.required_ruby_version = ['>= 2.0', '< 2.3']
8
+ s.platform = Gem::Platform::RUBY
9
+ s.version = Retriever::VERSION
10
+ s.name = 'rubyretriever'
11
+ s.date = '2016-04-11'
12
+ s.summary = 'Ruby Web Crawler & File Harvester'
13
+ s.description = 'Asynchronous web crawler, scraper and file harvester'
14
+ s.authors = ['Joe Norton']
15
+ s.email = ['joe@norton.io']
16
+ s.homepage = 'http://norton.io/rubyretriever/'
17
+ s.license = 'MIT'
18
+ # If you need to check in files that aren't .rb files, add them here
19
+ s.files = Dir['{lib}/**/*.rb', 'bin/*', 'LICENSE', '*.md',
20
+ '{spec}/*.rb']
21
+ s.require_path = 'lib'
22
+ s.rubyforge_project = 'rubyretriever'
23
+
24
+ # If you need an executable, add it here
25
+ s.executables = ['rr']
26
+ s.required_rubygems_version = '>= 1.3.6'
27
+
28
+ # If you have other dependencies, add them here
29
+ s.add_runtime_dependency 'em-synchrony'
30
+ s.add_runtime_dependency 'em-http-request'
31
+ s.add_runtime_dependency 'ruby-progressbar'
32
+ s.add_runtime_dependency 'bloomfilter-rb'
33
+ s.add_runtime_dependency 'addressable'
34
+ s.add_runtime_dependency 'htmlentities'
35
+ s.add_runtime_dependency 'nokogiri'
36
+
37
+ s.add_development_dependency 'bundler', '~> 1.6'
38
+ s.add_development_dependency 'rake', '~> 10.3'
39
+ s.add_development_dependency 'rspec', '~> 2.14'
40
+ s.add_development_dependency 'pry'
41
+ end
@@ -0,0 +1,77 @@
1
+ require 'retriever'
2
+
3
+ describe 'Link' do
4
+
5
+ t = Retriever::Target.new('http://www.cnet.com/reviews/')
6
+ let(:links) do
7
+ Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
8
+ end
9
+
10
+ it 'collects links in anchor tags' do
11
+ @source = (<<SOURCE).strip
12
+ <a href='http://www.cnet.com/download.exe'>download</a>
13
+ SOURCE
14
+
15
+ expect(links).to include('http://www.cnet.com/download.exe')
16
+ end
17
+
18
+ it 'collects links in link tags' do
19
+ @source = (<<SOURCE).strip
20
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
21
+ SOURCE
22
+
23
+ expect(links[0]).to include('formreset.css?ver=1.7.12')
24
+ end
25
+
26
+ it 'does not collect bare links (ones not in an href)' do
27
+ @source = (<<SOURCE).strip
28
+ http://www.google.com
29
+ SOURCE
30
+
31
+ expect(links).to_not include('http://www.google.com')
32
+ end
33
+
34
+ it 'collects only unique href links on the page' do
35
+ @source = (<<SOURCE).strip
36
+ <a href='http://www.cnet.com/products/gadgets'>gadgets</a>
37
+ <a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
38
+ SOURCE
39
+
40
+ expect(links.size).to eq(1)
41
+ end
42
+
43
+ it 'adds a protocol to urls missing them (www.)' do
44
+ @source = (<<SOURCE).strip
45
+ <a href='www.cnet.com/download.exe'>download</a>
46
+ SOURCE
47
+
48
+ expect(links).to include('http://www.cnet.com/download.exe')
49
+ end
50
+
51
+ it "doesn\'t care about any extra attributes on the anchor tag" do
52
+ @source = (<<SOURCE).strip
53
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
54
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
55
+ </a>
56
+ SOURCE
57
+
58
+ expect(links.size).to eq(1)
59
+ end
60
+
61
+ it 'returns relative urls with full path based on hostname' do
62
+ @source = (<<SOURCE).strip
63
+ <a href='/test.html'>test</a>
64
+ <a href='cpage_18'>about</a>
65
+ SOURCE
66
+
67
+ expect(links).to include('http://www.cnet.com/test.html',
68
+ 'http://www.cnet.com/reviews/cpage_18')
69
+ end
70
+ it 'collects files even when query strings exist' do
71
+ @source = (<<SOURCE).strip
72
+ <a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&amp;type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
73
+ SOURCE
74
+
75
+ expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&amp;type=audio')
76
+ end
77
+ end
@@ -0,0 +1,94 @@
1
+ require 'retriever/page'
2
+ require 'retriever/fetch'
3
+
4
+ t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
5
+
6
+ describe 'Page' do
7
+ let(:common_source) do
8
+ <<-SOURCE
9
+ <title>test</title>
10
+ <a href='www.cnet.com/download.exe'>download</a>
11
+ <a href='/test.html'>test</a>
12
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
13
+ </a>
14
+ <a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
15
+ <a href='http://www.yahoo.com/test/'>yahoo</a>"
16
+ <meta name='description' content="test2 ">
17
+ <h1>test 3</h1>
18
+ <h2> test 4 </h2>
19
+ SOURCE
20
+ end
21
+
22
+ describe '#url' do
23
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
24
+ it 'returns current page URL' do
25
+ expect(page.url).to eq('http://www.cnet.com/')
26
+ end
27
+ end
28
+
29
+ describe '#links' do
30
+ let(:source) { "<a href='/profile/'>profile</a><a href='#top'>top</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
31
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
32
+ it 'collects all unique href links on the page, skips div anchors' do
33
+ expect(page.links.size).to eq(2)
34
+ end
35
+ end
36
+
37
+ describe '#parse_internal' do
38
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
39
+ let(:links) { page.parse_internal }
40
+ it 'filters links by host' do
41
+ expect(links.size).to eq(3)
42
+ end
43
+ end
44
+
45
+ describe '#parse_internal_visitable' do
46
+ let(:source) { "<a href='/profile/'>profile</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
47
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
48
+ let(:links) { page.parse_internal_visitable }
49
+ it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
50
+ expect(links.size).to eq(1)
51
+ end
52
+ end
53
+
54
+ describe '#parse_files' do
55
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
56
+ let(:files) { page.parse_files(page.parse_internal) }
57
+ it 'filters links by filetype' do
58
+ expect(files.size).to eq(1)
59
+ end
60
+ end
61
+
62
+ describe '#parse_by_css' do
63
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
64
+
65
+ it 'returns the text from the received css selector' do
66
+ expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
67
+ end
68
+ end
69
+
70
+ describe '#title' do
71
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
72
+ it 'returns page title' do
73
+ expect(page.title).to eq('test')
74
+ end
75
+ end
76
+ describe '#desc' do
77
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
78
+ it 'returns meta description' do
79
+ expect(page.desc).to eq('test2 ')
80
+ end
81
+ end
82
+ describe '#h1' do
83
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
84
+ it 'returns h1 text' do
85
+ expect(page.h1).to eq('test 3')
86
+ end
87
+ end
88
+ describe '#h2' do
89
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
90
+ it 'returns h2 text' do
91
+ expect(page.h2).to eq(' test 4 ')
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,84 @@
1
+ require 'retriever'
2
+ require 'retriever/fetchfiles'
3
+
4
+ describe 'Fetch' do
5
+ let(:r) do
6
+ Retriever::Fetch.new('http://www.yahoo.com', {})
7
+ end
8
+ describe '#good_response?' do
9
+
10
+ let(:resp) do
11
+ {}
12
+ end
13
+
14
+ let(:nil_response) do
15
+ r.good_response?(nil, 'http://www.yahoo.com')
16
+ end
17
+
18
+ let(:unsuccessful_resp) do
19
+ resp.stub(:response_header).and_return(resp)
20
+ resp.stub(:redirection?).and_return(false)
21
+ resp.stub(:successful?).and_return(false)
22
+ resp.stub(:server_error?).and_return(false)
23
+ resp.stub(:client_error?).and_return(false)
24
+ r.good_response?(resp, 'http://www.yahoo.com')
25
+ end
26
+
27
+ let(:redir_resp) do
28
+ resp.stub(:response_header).and_return(resp)
29
+ resp.stub(:redirection?).and_return(true)
30
+ resp.stub(:location).and_return('http://www.google.com')
31
+ r.good_response?(resp, 'http://www.yahoo.com')
32
+ end
33
+
34
+ let(:bad_content_type_resp) do
35
+ resp.stub(:response_header).and_return(resp)
36
+ resp.stub(:redirection?).and_return(false)
37
+ resp.stub(:successful?).and_return(true)
38
+ resp['CONTENT_TYPE'] = 'image/jpeg'
39
+ r.good_response?(resp, 'http://www.yahoo.com')
40
+ end
41
+
42
+ let(:success_resp) do
43
+ resp.stub(:response_header).and_return(resp)
44
+ resp.stub(:redirection?).and_return(false)
45
+ resp.stub(:successful?).and_return(true)
46
+ resp['CONTENT_TYPE'] = 'text/html'
47
+ r.good_response?(resp, 'http://www.yahoo.com')
48
+ end
49
+
50
+ it 'returns false if the response is empty' do
51
+ expect(nil_response).to eq(false)
52
+ end
53
+
54
+ it 'returns false on unsuccessful connection' do
55
+ expect(unsuccessful_resp).to eq(false)
56
+ end
57
+
58
+ it 'returns false on redirecting host' do
59
+ expect(redir_resp).to eq(false)
60
+ end
61
+
62
+ it 'returns false on non-visitable content type' do
63
+ expect(bad_content_type_resp).to eq(false)
64
+ end
65
+
66
+ it 'returns true otherwise' do
67
+ expect(success_resp).to eq(true)
68
+ end
69
+ end
70
+ describe '#filter_out_querystrings' do
71
+ let(:normal_url) do
72
+ r.filter_out_querystrings('http://mises.org/test.mp3')
73
+ end
74
+ let(:query_string_url) do
75
+ r.filter_out_querystrings('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&amp;type=audio')
76
+ end
77
+ it 'accepts standard urls' do
78
+ expect(normal_url).to eq('http://mises.org/test.mp3')
79
+ end
80
+ it 'strips query params' do
81
+ expect(query_string_url).to eq('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3')
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,17 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+ RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+ config.run_all_when_everything_filtered = true
10
+ config.filter_run :focus
11
+
12
+ # Run specs in random order to surface order dependencies. If you find an
13
+ # order dependency and want to debug it, you can fix the order by providing
14
+ # the seed, which is printed after each run.
15
+ # --seed 1234
16
+ config.order = 'random'
17
+ end
@@ -0,0 +1,55 @@
1
+ require 'retriever'
2
+ require 'open-uri'
3
+
4
+ describe 'Target' do
5
+ let(:t) do
6
+ Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
7
+ end
8
+
9
+ it 'creates target var' do
10
+ expect(t.target).to eq('http://www.cnet.com/reviews/')
11
+ end
12
+
13
+ it 'creates host var' do
14
+ expect(t.host).to eq('www.cnet.com')
15
+ end
16
+
17
+ it 'creates host_re var' do
18
+ expect(t.host_re).to eq(/cnet.com/)
19
+ end
20
+
21
+ it 'creates port var (no port specified)' do
22
+ expect(t.port).to be_nil
23
+ end
24
+
25
+ it 'creates port var (with port specified)' do
26
+ expect(Retriever::Target.new('http://www.cnet.com:3000/reviews/', /\.exe\z/).port).to be(3000)
27
+ end
28
+
29
+ it 'creates file_re var (when provided)' do
30
+ expect(t.file_re).to eq(/\.exe\z/)
31
+ end
32
+
33
+ it 'adds protocol to Target URL if none given' do
34
+ expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
35
+ end
36
+
37
+ it 'fails if given URL has no dot in it' do
38
+ expect { Retriever::Target.new('cnetcom') }.to raise_error
39
+ end
40
+
41
+ describe '#source' do
42
+ let(:redirecting_url) do
43
+ Retriever::Target.new('http://software-by-joe.appspot.com').source
44
+ end
45
+
46
+ it 'opens URL and returns source as String' do
47
+ expect(Retriever::Target.new('http://techcrunch.com/').source.class)
48
+ .to eq(String)
49
+ end
50
+
51
+ it 'fails if target redirects to new host' do
52
+ expect { redirecting_url }.to raise_error
53
+ end
54
+ end
55
+ end