powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
@@ -0,0 +1,344 @@
1
+ module.exports = {
2
+ /**
3
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.2.1
4
+ *
5
+ * This interim response indicates that everything so far is OK and that the client should continue with the request or ignore it if it is already finished.
6
+ */
7
+ CONTINUE: 100,
8
+ /**
9
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.2.2
10
+ *
11
+ * This code is sent in response to an Upgrade request header by the client, and indicates the protocol the server is switching to.
12
+ */
13
+ SWITCHING_PROTOCOLS: 101,
14
+ /**
15
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.1
16
+ *
17
+ * This code indicates that the server has received and is processing the request, but no response is available yet.
18
+ */
19
+ PROCESSING: 102,
20
+ /**
21
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.1
22
+ *
23
+ * The request has succeeded. The meaning of a success varies depending on the HTTP method:
24
+ * GET: The resource has been fetched and is transmitted in the message body.
25
+ * HEAD: The entity headers are in the message body.
26
+ * POST: The resource describing the result of the action is transmitted in the message body.
27
+ * TRACE: The message body contains the request message as received by the server
28
+ */
29
+ OK: 200,
30
+ /**
31
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.2
32
+ *
33
+ * The request has succeeded and a new resource has been created as a result of it. This is typically the response sent after a PUT request.
34
+ */
35
+ CREATED: 201,
36
+ /**
37
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.3
38
+ *
39
+ * The request has been received but not yet acted upon. It is non-committal, meaning that there is no way in HTTP to later send an asynchronous response indicating the outcome of processing the request. It is intended for cases where another process or server handles the request, or for batch processing.
40
+ */
41
+ ACCEPTED: 202,
42
+ /**
43
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.4
44
+ *
45
+ * This response code means returned meta-information set is not exact set as available from the origin server, but collected from a local or a third party copy. Except this condition, 200 OK response should be preferred instead of this response.
46
+ */
47
+ NON_AUTHORITATIVE_INFORMATION: 203,
48
+ /**
49
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.5
50
+ *
51
+ * There is no content to send for this request, but the headers may be useful. The user-agent may update its cached headers for this resource with the new ones.
52
+ */
53
+ NO_CONTENT: 204,
54
+ /**
55
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.3.6
56
+ *
57
+ * This response code is sent after accomplishing request to tell user agent reset document view which sent this request.
58
+ */
59
+ RESET_CONTENT: 205,
60
+ /**
61
+ * Official Documentation @ https://tools.ietf.org/html/rfc7233#section-4.1
62
+ *
63
+ * This response code is used because of range header sent by the client to separate download into multiple streams.
64
+ */
65
+ PARTIAL_CONTENT: 206,
66
+ /**
67
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.2
68
+ *
69
+ * A Multi-Status response conveys information about multiple resources in situations where multiple status codes might be appropriate.
70
+ */
71
+ MULTI_STATUS: 207,
72
+ /**
73
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.1
74
+ *
75
+ * The request has more than one possible responses. User-agent or user should choose one of them. There is no standardized way to choose one of the responses.
76
+ */
77
+ MULTIPLE_CHOICES: 300,
78
+ /**
79
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.2
80
+ *
81
+ * This response code means that URI of requested resource has been changed. Probably, new URI would be given in the response.
82
+ */
83
+ MOVED_PERMANENTLY: 301,
84
+ /**
85
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.3
86
+ *
87
+ * This response code means that URI of requested resource has been changed temporarily. New changes in the URI might be made in the future. Therefore, this same URI should be used by the client in future requests.
88
+ */
89
+ MOVED_TEMPORARILY: 302,
90
+ /**
91
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.4
92
+ *
93
+ * The server sends this response to direct the client to get the requested resource at another URI with a GET request.
94
+ */
95
+ SEE_OTHER: 303,
96
+ /**
97
+ * Official Documentation @ https://tools.ietf.org/html/rfc7232#section-4.1
98
+ *
99
+ * This is used for caching purposes. It is telling to client that response has not been modified. So, client can continue to use same cached version of response.
100
+ */
101
+ NOT_MODIFIED: 304,
102
+ /**
103
+ * @deprecated
104
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.6
105
+ *
106
+ * Was defined in a previous version of the HTTP specification to indicate that a requested response must be accessed by a proxy. It has been deprecated due to security concerns regarding in-band configuration of a proxy.
107
+ */
108
+ USE_PROXY: 305,
109
+ /**
110
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.4.7
111
+ *
112
+ * The server sends this response to direct the client to get the requested resource at another URI with the same method that was used in the prior request. This has the same semantics as the 302 Found HTTP response code, with the exception that the user agent must not change the HTTP method used: if a POST was used in the first request, a POST must be used in the second request.
113
+ */
114
+ TEMPORARY_REDIRECT: 307,
115
+ /**
116
+ * Official Documentation @ https://tools.ietf.org/html/rfc7538#section-3
117
+ *
118
+ * This means that the resource is now permanently located at another URI, specified by the Location: HTTP Response header. This has the same semantics as the 301 Moved Permanently HTTP response code, with the exception that the user agent must not change the HTTP method used: if a POST was used in the first request, a POST must be used in the second request.
119
+ */
120
+ PERMANENT_REDIRECT: 308,
121
+ /**
122
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.1
123
+ *
124
+ * This response means that server could not understand the request due to invalid syntax.
125
+ */
126
+ BAD_REQUEST: 400,
127
+ /**
128
+ * Official Documentation @ https://tools.ietf.org/html/rfc7235#section-3.1
129
+ *
130
+ * Although the HTTP standard specifies "unauthorized", semantically this response means "unauthenticated". That is, the client must authenticate itself to get the requested response.
131
+ */
132
+ UNAUTHORIZED: 401,
133
+ /**
134
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.2
135
+ *
136
+ * This response code is reserved for future use. Initial aim for creating this code was using it for digital payment systems however this is not used currently.
137
+ */
138
+ PAYMENT_REQUIRED: 402,
139
+ /**
140
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.3
141
+ *
142
+ * The client does not have access rights to the content, i.e. they are unauthorized, so server is rejecting to give proper response. Unlike 401, the client's identity is known to the server.
143
+ */
144
+ FORBIDDEN: 403,
145
+ /**
146
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.4
147
+ *
148
+ * The server can not find requested resource. In the browser, this means the URL is not recognized. In an API, this can also mean that the endpoint is valid but the resource itself does not exist. Servers may also send this response instead of 403 to hide the existence of a resource from an unauthorized client. This response code is probably the most famous one due to its frequent occurrence on the web.
149
+ */
150
+ NOT_FOUND: 404,
151
+ /**
152
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.5
153
+ *
154
+ * The request method is known by the server but has been disabled and cannot be used. For example, an API may forbid DELETE-ing a resource. The two mandatory methods, GET and HEAD, must never be disabled and should not return this error code.
155
+ */
156
+ METHOD_NOT_ALLOWED: 405,
157
+ /**
158
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.6
159
+ *
160
+ * This response is sent when the web server, after performing server-driven content negotiation, doesn't find any content following the criteria given by the user agent.
161
+ */
162
+ NOT_ACCEPTABLE: 406,
163
+ /**
164
+ * Official Documentation @ https://tools.ietf.org/html/rfc7235#section-3.2
165
+ *
166
+ * This is similar to 401 but authentication is needed to be done by a proxy.
167
+ */
168
+ PROXY_AUTHENTICATION_REQUIRED: 407,
169
+ /**
170
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.7
171
+ *
172
+ * This response is sent on an idle connection by some servers, even without any previous request by the client. It means that the server would like to shut down this unused connection. This response is used much more since some browsers, like Chrome, Firefox 27+, or IE9, use HTTP pre-connection mechanisms to speed up surfing. Also note that some servers merely shut down the connection without sending this message.
173
+ */
174
+ REQUEST_TIMEOUT: 408,
175
+ /**
176
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.8
177
+ *
178
+ * This response is sent when a request conflicts with the current state of the server.
179
+ */
180
+ CONFLICT: 409,
181
+ /**
182
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.9
183
+ *
184
+ * This response would be sent when the requested content has been permanently deleted from server, with no forwarding address. Clients are expected to remove their caches and links to the resource. The HTTP specification intends this status code to be used for "limited-time, promotional services". APIs should not feel compelled to indicate resources that have been deleted with this status code.
185
+ */
186
+ GONE: 410,
187
+ /**
188
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.10
189
+ *
190
+ * The server rejected the request because the Content-Length header field is not defined and the server requires it.
191
+ */
192
+ LENGTH_REQUIRED: 411,
193
+ /**
194
+ * Official Documentation @ https://tools.ietf.org/html/rfc7232#section-4.2
195
+ *
196
+ * The client has indicated preconditions in its headers which the server does not meet.
197
+ */
198
+ PRECONDITION_FAILED: 412,
199
+ /**
200
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.11
201
+ *
202
+ * Request entity is larger than limits defined by server; the server might close the connection or return a Retry-After header field.
203
+ */
204
+ REQUEST_TOO_LONG: 413,
205
+ /**
206
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.12
207
+ *
208
+ * The URI requested by the client is longer than the server is willing to interpret.
209
+ */
210
+ REQUEST_URI_TOO_LONG: 414,
211
+ /**
212
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.13
213
+ *
214
+ * The media format of the requested data is not supported by the server, so the server is rejecting the request.
215
+ */
216
+ UNSUPPORTED_MEDIA_TYPE: 415,
217
+ /**
218
+ * Official Documentation @ https://tools.ietf.org/html/rfc7233#section-4.4
219
+ *
220
+ * The range specified by the Range header field in the request can't be fulfilled; it's possible that the range is outside the size of the target URI's data.
221
+ */
222
+ REQUESTED_RANGE_NOT_SATISFIABLE: 416,
223
+ /**
224
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.5.14
225
+ *
226
+ * This response code means the expectation indicated by the Expect request header field can't be met by the server.
227
+ */
228
+ EXPECTATION_FAILED: 417,
229
+ /**
230
+ * Official Documentation @ https://tools.ietf.org/html/rfc2324#section-2.3.2
231
+ *
232
+ * Any attempt to brew coffee with a teapot should result in the error code "418 I'm a teapot". The resulting entity body MAY be short and stout.
233
+ */
234
+ IM_A_TEAPOT: 418,
235
+ /**
236
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.6
237
+ *
238
+ * The 507 (Insufficient Storage) status code means the method could not be performed on the resource because the server is unable to store the representation needed to successfully complete the request. This condition is considered to be temporary. If the request which received this status code was the result of a user action, the request MUST NOT be repeated until it is requested by a separate user action.
239
+ */
240
+ INSUFFICIENT_SPACE_ON_RESOURCE: 419,
241
+ /**
242
+ * @deprecated
243
+ * Official Documentation @ https://tools.ietf.org/rfcdiff?difftype=--hwdiff&url2=draft-ietf-webdav-protocol-06.txt
244
+ *
245
+ * A deprecated response used by the Spring Framework when a method has failed.
246
+ */
247
+ METHOD_FAILURE: 420,
248
+ /**
249
+ * Official Documentation @ https://datatracker.ietf.org/doc/html/rfc7540#section-9.1.2
250
+ *
251
+ * Defined in the specification of HTTP/2 to indicate that a server is not able to produce a response for the combination of scheme and authority that are included in the request URI.
252
+ */
253
+ MISDIRECTED_REQUEST: 421,
254
+ /**
255
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.3
256
+ *
257
+ * The request was well-formed but was unable to be followed due to semantic errors.
258
+ */
259
+ UNPROCESSABLE_ENTITY: 422,
260
+ /**
261
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.4
262
+ *
263
+ * The resource that is being accessed is locked.
264
+ */
265
+ LOCKED: 423,
266
+ /**
267
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.5
268
+ *
269
+ * The request failed due to failure of a previous request.
270
+ */
271
+ FAILED_DEPENDENCY: 424,
272
+ /**
273
+ * Official Documentation @ https://tools.ietf.org/html/rfc6585#section-3
274
+ *
275
+ * The origin server requires the request to be conditional. Intended to prevent the 'lost update' problem, where a client GETs a resource's state, modifies it, and PUTs it back to the server, when meanwhile a third party has modified the state on the server, leading to a conflict.
276
+ */
277
+ PRECONDITION_REQUIRED: 428,
278
+ /**
279
+ * Official Documentation @ https://tools.ietf.org/html/rfc6585#section-4
280
+ *
281
+ * The user has sent too many requests in a given amount of time ("rate limiting").
282
+ */
283
+ TOO_MANY_REQUESTS: 429,
284
+ /**
285
+ * Official Documentation @ https://tools.ietf.org/html/rfc6585#section-5
286
+ *
287
+ * The server is unwilling to process the request because its header fields are too large. The request MAY be resubmitted after reducing the size of the request header fields.
288
+ */
289
+ REQUEST_HEADER_FIELDS_TOO_LARGE: 431,
290
+ /**
291
+ * Official Documentation @ https://tools.ietf.org/html/rfc7725
292
+ *
293
+ * The user-agent requested a resource that cannot legally be provided, such as a web page censored by a government.
294
+ */
295
+ UNAVAILABLE_FOR_LEGAL_REASONS: 451,
296
+ /**
297
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.1
298
+ *
299
+ * The server encountered an unexpected condition that prevented it from fulfilling the request.
300
+ */
301
+ INTERNAL_SERVER_ERROR: 500,
302
+ /**
303
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.2
304
+ *
305
+ * The request method is not supported by the server and cannot be handled. The only methods that servers are required to support (and therefore that must not return this code) are GET and HEAD.
306
+ */
307
+ NOT_IMPLEMENTED: 501,
308
+ /**
309
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.3
310
+ *
311
+ * This error response means that the server, while working as a gateway to get a response needed to handle the request, got an invalid response.
312
+ */
313
+ BAD_GATEWAY: 502,
314
+ /**
315
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.4
316
+ *
317
+ * The server is not ready to handle the request. Common causes are a server that is down for maintenance or that is overloaded. Note that together with this response, a user-friendly page explaining the problem should be sent. This response should be used for temporary conditions and the Retry-After: HTTP header should, if possible, contain the estimated time before the recovery of the service. The webmaster must also take care about the caching-related headers that are sent along with this response, as these temporary condition responses should usually not be cached.
318
+ */
319
+ SERVICE_UNAVAILABLE: 503,
320
+ /**
321
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.5
322
+ *
323
+ * This error response is given when the server is acting as a gateway and cannot get a response in time.
324
+ */
325
+ GATEWAY_TIMEOUT: 504,
326
+ /**
327
+ * Official Documentation @ https://tools.ietf.org/html/rfc7231#section-6.6.6
328
+ *
329
+ * The HTTP version used in the request is not supported by the server.
330
+ */
331
+ HTTP_VERSION_NOT_SUPPORTED: 505,
332
+ /**
333
+ * Official Documentation @ https://tools.ietf.org/html/rfc2518#section-10.6
334
+ *
335
+ * The 507 (Insufficient Storage) status code means the method could not be performed on the resource because the server is unable to store the representation needed to successfully complete the request. This condition is considered to be temporary.
336
+ */
337
+ INSUFFICIENT_STORAGE: 507,
338
+ /**
339
+ * Official Documentation @ https://tools.ietf.org/html/rfc6585#section-6
340
+ *
341
+ * The 511 status code indicates that the client needs to authenticate to gain network access.
342
+ */
343
+ NETWORK_AUTHENTICATION_REQUIRED: 511
344
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "powerdlz23",
3
- "version": "1.2.3",
3
+ "version": "1.2.5",
4
4
  "scripts": {
5
5
  "dev": "next dev",
6
6
  "build": "next build",
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
4
+ cache: bundler
5
+ before_install: gem install rspec
6
+ before_install: gem install rake
7
+ script: bundle exec rake
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
@@ -0,0 +1,64 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rubyretriever (1.4.5)
5
+ addressable
6
+ bloomfilter-rb
7
+ em-http-request
8
+ em-synchrony
9
+ htmlentities
10
+ nokogiri
11
+ ruby-progressbar
12
+
13
+ GEM
14
+ remote: https://rubygems.org/
15
+ specs:
16
+ addressable (2.4.0)
17
+ bloomfilter-rb (2.1.1)
18
+ redis
19
+ coderay (1.1.0)
20
+ cookiejar (0.3.0)
21
+ diff-lcs (1.2.5)
22
+ em-http-request (1.1.3)
23
+ addressable (>= 2.3.4)
24
+ cookiejar (<= 0.3.0)
25
+ em-socksify (>= 0.3)
26
+ eventmachine (>= 1.0.3)
27
+ http_parser.rb (>= 0.6.0)
28
+ em-socksify (0.3.1)
29
+ eventmachine (>= 1.0.0.beta.4)
30
+ em-synchrony (1.0.4)
31
+ eventmachine (>= 1.0.0.beta.1)
32
+ eventmachine (1.2.0.1)
33
+ htmlentities (4.3.4)
34
+ http_parser.rb (0.6.0)
35
+ method_source (0.8.2)
36
+ mini_portile2 (2.0.0)
37
+ nokogiri (1.6.7.2)
38
+ mini_portile2 (~> 2.0.0.rc2)
39
+ pry (0.10.1)
40
+ coderay (~> 1.1.0)
41
+ method_source (~> 0.8.1)
42
+ slop (~> 3.4)
43
+ rake (10.3.2)
44
+ redis (3.2.2)
45
+ rspec (2.99.0)
46
+ rspec-core (~> 2.99.0)
47
+ rspec-expectations (~> 2.99.0)
48
+ rspec-mocks (~> 2.99.0)
49
+ rspec-core (2.99.0)
50
+ rspec-expectations (2.99.0)
51
+ diff-lcs (>= 1.1.3, < 2.0)
52
+ rspec-mocks (2.99.0)
53
+ ruby-progressbar (1.7.5)
54
+ slop (3.6.0)
55
+
56
+ PLATFORMS
57
+ ruby
58
+
59
+ DEPENDENCIES
60
+ bundler (~> 1.6)
61
+ pry
62
+ rake (~> 10.3)
63
+ rspec (~> 2.14)
64
+ rubyretriever!
@@ -0,0 +1,20 @@
1
+ 2016 (c) Joseph Michael Norton - @JoeNorton - http://Norton.io
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,7 @@
1
require 'rspec/core/rake_task'

# Registers the `spec` task; RSpec is invoked with the options
# `--color --format d`.
RSpec::Core::RakeTask.new(:spec) { |spec_task| spec_task.rspec_opts = ['--color', '--format', 'd'] }

# A bare `rake` runs the spec task.
task :default => :spec
@@ -0,0 +1,79 @@
1
#! /usr/bin/env ruby
# Command-line entry point: parses the mode flag and options, then runs one
# Retriever::CLI per remaining command-line argument (the target URLs).
require 'retriever'
require 'optparse'

# Defaults for every option key; the parser callbacks below overwrite them.
options = {
  'sitemap'     => false,
  'fileharvest' => false,
  'seo'         => false,
  'filename'    => nil,
  'verbose'     => false,
  'progress'    => false,
  'maxpages'    => 100,
  'autodown'    => false
}

optparse = OptionParser.new do |opts|
  # Banner shown at the top of the help screen.
  opts.banner = 'Usage: rr [MODE FLAG] [options] Target_URL'

  # FORMAT is optional; an empty string records "sitemap mode, no format".
  opts.on('-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode') do |type|
    options['sitemap'] = type || ''
  end
  opts.on('-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode') do |file_e|
    options['fileharvest'] = file_e
  end
  opts.on('-e', '--seo', 'MODE FLAG: SEO mode') do
    options['seo'] = true
  end
  opts.on('-o', '--out FILENAME', 'Dump output to file') do |file|
    options['filename'] = file
  end
  opts.on('-v', '--verbose', 'Output more information') do
    options['verbose'] = true
  end
  opts.on('-p', '--progress', 'Output progress bar') do
    options['progress'] = true
  end
  # Coerce the limit to an Integer so a user-supplied value has the same
  # type as the default (100); non-numeric input is rejected by the parser.
  opts.on('-l',
          '--limit PAGE_LIMIT_#',
          OptionParser::DecimalInteger,
          'set a max on the total number of crawled pages') do |maxp|
    options['maxpages'] = maxp
  end
  opts.on('-a', '--auto', 'Automatically download all files located') do
    options['autodown'] = true
  end
  # Print the help screen and stop.
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end

# Report bad flags/arguments as a short message plus the usage text
# instead of letting the exception surface as a backtrace.
begin
  optparse.parse!
rescue OptionParser::ParseError => e
  abort("### #{e.message}\n#{optparse}")
end

# parse! strips the recognised flags from ARGV; what is left are the targets.
if ARGV.empty?
  abort("###Missing Required Argument\nUsage: rr [mode] [options] Target_URL")
end

ARGV.each do |target_url|
  if options['verbose']
    puts '###############################'
    puts '### [RubyRetriever]'
    puts '### Creating Sitemap' if options['sitemap']
    puts "### Outputting in format: #{options['sitemap']}" if options['sitemap']
    puts '### Performing File Harvest' if options['fileharvest']
    if options['fileharvest']
      puts "### Searching for filetype: #{options['fileharvest']}"
    end
    puts '### Performing SEO Scrape' if options['seo']
    puts "### Writing to file: #{options['filename']}" if options['filename']
    puts '### Being verbose'
    puts "### Stopping after #{options['maxpages']} pages"
  end
  puts '###############################'
  puts "### [RubyRetriever] go fetch #{target_url}"
  Retriever::CLI.new(target_url, options)
  puts '### [RubyRetriever] is done.'
  puts '###############################'
  puts
end
@@ -0,0 +1,25 @@
1
module Retriever
  # Command-line driver: builds the fetcher for the selected mode and runs
  # its output steps.
  class CLI
    # Builds the fetcher and immediately runs it.
    #
    # url     - target passed unchanged to the fetcher constructor.
    # options - Hash with String keys; this class reads 'fileharvest',
    #           'sitemap', 'seo', 'filename' and 'autodown'.
    #
    # Raises RuntimeError when no mode option is set.
    def initialize(url, options)
      # kick off the fetch mode of choice
      @fetch = choose_fetch_mode(url, options)
      @fetch.dump
      @fetch.write if options['filename']
      @fetch.autodownload if options['autodown'] && options['fileharvest']
      @fetch.gen_xml if xml_sitemap?(options)
    end

    # Returns a new fetcher for the first mode set in options, checked in
    # the order fileharvest, sitemap, seo.
    #
    # Raises RuntimeError when none of the three mode options is truthy.
    def choose_fetch_mode(url, options)
      return Retriever::FetchFiles.new(url, options) if options['fileharvest']
      return Retriever::FetchSitemap.new(url, options) if options['sitemap']
      return Retriever::FetchSEO.new(url, options) if options['seo']

      raise '### Error: No Mode Selected'
    end

    private

    # True when an XML sitemap was requested (case-insensitive match on the
    # 'sitemap' option) and the chosen fetcher can produce one.
    #
    # 'fileharvest' outranks 'sitemap' in choose_fetch_mode, so the fetcher
    # built may not be the sitemap one even when a sitemap format was given.
    # NOTE(review): the fetcher classes are not visible from here; the
    # respond_to? guard only skips the call where it would otherwise raise
    # NoMethodError.
    def xml_sitemap?(options)
      return false unless /XML/i =~ options['sitemap'].to_s

      @fetch.respond_to?(:gen_xml)
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'htmlentities'
2
+ #
3
# String refinements for cleaning up fetched text; they apply only where
# `using SourceString` is active.
module SourceString
  refine String do
    # Hands the string to a fresh HTMLEntities coder and returns whatever
    # its #decode returns.
    def decode_html
      entity_coder = HTMLEntities.new
      entity_coder.decode(self)
    end

    # Returns the string transcoded to UTF-8; invalid byte sequences and
    # characters with no UTF-8 mapping are replaced rather than raising.
    def encode_utf8_and_replace
      encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    end
  end
end