powerdlz23 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Change Log
|
|
2
|
+
========================================
|
|
3
|
+
|
|
4
|
+
v0.5.4 - 2023-07-27
|
|
5
|
+
----------------------------------------
|
|
6
|
+
|
|
7
|
+
- Drop support for Ruby <= 2.6
|
|
8
|
+
- Upgrade dependencies and rubocop cleanup
|
|
9
|
+
- Fix css_selector option
|
|
10
|
+
- Drop support for Ruby 2.x
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
v0.5.3 - 2021-03-29
|
|
14
|
+
----------------------------------------
|
|
15
|
+
|
|
16
|
+
- Add skip_ssl_verification config option
|
|
17
|
+
- Add screenshot_delay config option
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
v0.5.2 - 2021-02-25
|
|
21
|
+
----------------------------------------
|
|
22
|
+
|
|
23
|
+
- Fix logging percent issue
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## [v0.5.1](https://github.com/DannyBen/snapcrawl/tree/v0.5.1) (2020-03-14)
|
|
27
|
+
|
|
28
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.5.0...v0.5.1)
|
|
29
|
+
|
|
30
|
+
**Merged pull requests:**
|
|
31
|
+
|
|
32
|
+
- Add additional test cases and exception safeguards [\#30](https://github.com/DannyBen/snapcrawl/pull/30) ([DannyBen](https://github.com/DannyBen))
|
|
33
|
+
|
|
34
|
+
## [v0.5.0](https://github.com/DannyBen/snapcrawl/tree/v0.5.0) (2020-03-14)
|
|
35
|
+
|
|
36
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.5.0.rc1...v0.5.0)
|
|
37
|
+
|
|
38
|
+
**Merged pull requests:**
|
|
39
|
+
|
|
40
|
+
- Epic refactor [\#29](https://github.com/DannyBen/snapcrawl/pull/29) ([DannyBen](https://github.com/DannyBen))
|
|
41
|
+
|
|
42
|
+
## [v0.5.0.rc1](https://github.com/DannyBen/snapcrawl/tree/v0.5.0.rc1) (2020-03-14)
|
|
43
|
+
|
|
44
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.4.4...v0.5.0.rc1)
|
|
45
|
+
|
|
46
|
+
## [v0.4.4](https://github.com/DannyBen/snapcrawl/tree/v0.4.4) (2020-03-12)
|
|
47
|
+
|
|
48
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.4.3...v0.4.4)
|
|
49
|
+
|
|
50
|
+
**Merged pull requests:**
|
|
51
|
+
|
|
52
|
+
- Rescue imagemagick exceptions [\#28](https://github.com/DannyBen/snapcrawl/pull/28) ([DannyBen](https://github.com/DannyBen))
|
|
53
|
+
- Switch to github actions [\#27](https://github.com/DannyBen/snapcrawl/pull/27) ([DannyBen](https://github.com/DannyBen))
|
|
54
|
+
|
|
55
|
+
## [v0.4.3](https://github.com/DannyBen/snapcrawl/tree/v0.4.3) (2020-01-09)
|
|
56
|
+
|
|
57
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.4.2...v0.4.3)
|
|
58
|
+
|
|
59
|
+
## [v0.4.2](https://github.com/DannyBen/snapcrawl/tree/v0.4.2) (2020-01-09)
|
|
60
|
+
|
|
61
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.4.1...v0.4.2)
|
|
62
|
+
|
|
63
|
+
**Merged pull requests:**
|
|
64
|
+
|
|
65
|
+
- Improve handling of malformed URIs [\#26](https://github.com/DannyBen/snapcrawl/pull/26) ([DannyBen](https://github.com/DannyBen))
|
|
66
|
+
|
|
67
|
+
## [v0.4.1](https://github.com/DannyBen/snapcrawl/tree/v0.4.1) (2020-01-09)
|
|
68
|
+
|
|
69
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.4.0...v0.4.1)
|
|
70
|
+
|
|
71
|
+
**Merged pull requests:**
|
|
72
|
+
|
|
73
|
+
- Updates for ruby 2.7 [\#25](https://github.com/DannyBen/snapcrawl/pull/25) ([DannyBen](https://github.com/DannyBen))
|
|
74
|
+
- Test with ruby 2.7 [\#23](https://github.com/DannyBen/snapcrawl/pull/23) ([DannyBen](https://github.com/DannyBen))
|
|
75
|
+
- Improve error handling [\#20](https://github.com/DannyBen/snapcrawl/pull/20) ([DannyBen](https://github.com/DannyBen))
|
|
76
|
+
|
|
77
|
+
## [v0.4.0](https://github.com/DannyBen/snapcrawl/tree/v0.4.0) (2020-01-01)
|
|
78
|
+
|
|
79
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.3.1...v0.4.0)
|
|
80
|
+
|
|
81
|
+
**Merged pull requests:**
|
|
82
|
+
|
|
83
|
+
- Remove go subcommand [\#22](https://github.com/DannyBen/snapcrawl/pull/22) ([DannyBen](https://github.com/DannyBen))
|
|
84
|
+
- Make CI more consistent [\#21](https://github.com/DannyBen/snapcrawl/pull/21) ([DannyBen](https://github.com/DannyBen))
|
|
85
|
+
|
|
86
|
+
## [v0.3.1](https://github.com/DannyBen/snapcrawl/tree/v0.3.1) (2019-09-11)
|
|
87
|
+
|
|
88
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.3.0...v0.3.1)
|
|
89
|
+
|
|
90
|
+
**Fixed bugs:**
|
|
91
|
+
|
|
92
|
+
- Try catch error instead of stopping script [\#19](https://github.com/DannyBen/snapcrawl/issues/19)
|
|
93
|
+
- error : Cliver::Dependency::VersionMismatch [\#18](https://github.com/DannyBen/snapcrawl/issues/18)
|
|
94
|
+
- RuntimeError redirection forbidden [\#16](https://github.com/DannyBen/snapcrawl/issues/16)
|
|
95
|
+
|
|
96
|
+
## [v0.3.0](https://github.com/DannyBen/snapcrawl/tree/v0.3.0) (2019-09-10)
|
|
97
|
+
|
|
98
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.8...v0.3.0)
|
|
99
|
+
|
|
100
|
+
**Merged pull requests:**
|
|
101
|
+
|
|
102
|
+
- Fixes round [\#17](https://github.com/DannyBen/snapcrawl/pull/17) ([DannyBen](https://github.com/DannyBen))
|
|
103
|
+
|
|
104
|
+
## [v0.2.8](https://github.com/DannyBen/snapcrawl/tree/v0.2.8) (2019-06-14)
|
|
105
|
+
|
|
106
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.7...v0.2.8)
|
|
107
|
+
|
|
108
|
+
**Closed issues:**
|
|
109
|
+
|
|
110
|
+
- Improve tests and run tests on Travis [\#13](https://github.com/DannyBen/snapcrawl/issues/13)
|
|
111
|
+
- Save all versions of snapshot? [\#11](https://github.com/DannyBen/snapcrawl/issues/11)
|
|
112
|
+
|
|
113
|
+
**Merged pull requests:**
|
|
114
|
+
|
|
115
|
+
- Add Travis CI [\#15](https://github.com/DannyBen/snapcrawl/pull/15) ([DannyBen](https://github.com/DannyBen))
|
|
116
|
+
- Add ability to set filename template [\#14](https://github.com/DannyBen/snapcrawl/pull/14) ([DannyBen](https://github.com/DannyBen))
|
|
117
|
+
|
|
118
|
+
## [v0.2.7](https://github.com/DannyBen/snapcrawl/tree/v0.2.7) (2019-06-13)
|
|
119
|
+
|
|
120
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.6...v0.2.7)
|
|
121
|
+
|
|
122
|
+
**Closed issues:**
|
|
123
|
+
|
|
124
|
+
- Using snapcrawl via proxy? [\#10](https://github.com/DannyBen/snapcrawl/issues/10)
|
|
125
|
+
|
|
126
|
+
**Merged pull requests:**
|
|
127
|
+
|
|
128
|
+
- Fix ignored --folder parameter [\#12](https://github.com/DannyBen/snapcrawl/pull/12) ([DannyBen](https://github.com/DannyBen))
|
|
129
|
+
|
|
130
|
+
## [v0.2.6](https://github.com/DannyBen/snapcrawl/tree/v0.2.6) (2019-04-18)
|
|
131
|
+
|
|
132
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.5...v0.2.6)
|
|
133
|
+
|
|
134
|
+
**Closed issues:**
|
|
135
|
+
|
|
136
|
+
- Screenshots not saving to default snaps folder on Windows machine [\#6](https://github.com/DannyBen/snapcrawl/issues/6)
|
|
137
|
+
- Add the ability to pass headers into the application [\#3](https://github.com/DannyBen/snapcrawl/issues/3)
|
|
138
|
+
|
|
139
|
+
**Merged pull requests:**
|
|
140
|
+
|
|
141
|
+
- Upgrade colsole to fix windows command\_exist [\#9](https://github.com/DannyBen/snapcrawl/pull/9) ([DannyBen](https://github.com/DannyBen))
|
|
142
|
+
|
|
143
|
+
## [v0.2.5](https://github.com/DannyBen/snapcrawl/tree/v0.2.5) (2019-03-14)
|
|
144
|
+
|
|
145
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.4...v0.2.5)
|
|
146
|
+
|
|
147
|
+
**Fixed bugs:**
|
|
148
|
+
|
|
149
|
+
- Screenshots not saving to default or specified folder locations [\#4](https://github.com/DannyBen/snapcrawl/issues/4)
|
|
150
|
+
|
|
151
|
+
**Merged pull requests:**
|
|
152
|
+
|
|
153
|
+
- Alert when imagemagick is not installed [\#7](https://github.com/DannyBen/snapcrawl/pull/7) ([DannyBen](https://github.com/DannyBen))
|
|
154
|
+
|
|
155
|
+
## [v0.2.4](https://github.com/DannyBen/snapcrawl/tree/v0.2.4) (2018-10-18)
|
|
156
|
+
|
|
157
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.3...v0.2.4)
|
|
158
|
+
|
|
159
|
+
**Merged pull requests:**
|
|
160
|
+
|
|
161
|
+
- Switch from screencap to webshot [\#5](https://github.com/DannyBen/snapcrawl/pull/5) ([DannyBen](https://github.com/DannyBen))
|
|
162
|
+
- Switch from minitest to rspec [\#2](https://github.com/DannyBen/snapcrawl/pull/2) ([DannyBen](https://github.com/DannyBen))
|
|
163
|
+
|
|
164
|
+
## [v0.2.3](https://github.com/DannyBen/snapcrawl/tree/v0.2.3) (2017-03-15)
|
|
165
|
+
|
|
166
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.2...v0.2.3)
|
|
167
|
+
|
|
168
|
+
**Merged pull requests:**
|
|
169
|
+
|
|
170
|
+
- Fixes [\#1](https://github.com/DannyBen/snapcrawl/pull/1) ([DannyBen](https://github.com/DannyBen))
|
|
171
|
+
|
|
172
|
+
## [v0.2.2](https://github.com/DannyBen/snapcrawl/tree/v0.2.2) (2015-12-05)
|
|
173
|
+
|
|
174
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.1...v0.2.2)
|
|
175
|
+
|
|
176
|
+
## [v0.2.1](https://github.com/DannyBen/snapcrawl/tree/v0.2.1) (2015-12-05)
|
|
177
|
+
|
|
178
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/v0.2.0...v0.2.1)
|
|
179
|
+
|
|
180
|
+
## [v0.2.0](https://github.com/DannyBen/snapcrawl/tree/v0.2.0) (2015-12-05)
|
|
181
|
+
|
|
182
|
+
[Full Changelog](https://github.com/DannyBen/snapcrawl/compare/0710e5f8d5e45b5341ae4a9fa2212d5c76c72de4...v0.2.0)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2015 Danny Ben Shitrit
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Snapcrawl - crawl a website and take screenshots
|
|
2
|
+
|
|
3
|
+
[Gem Version](http://badge.fury.io/rb/snapcrawl)
|
|
4
|
+
[Build Status](https://github.com/DannyBen/snapcrawl/actions?query=workflow%3ATest)
|
|
5
|
+
[Code Climate](https://codeclimate.com/github/DannyBen/snapcrawl)
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Snapcrawl is a command line utility for crawling a website and saving
|
|
10
|
+
screenshots.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- Crawls a website to any given depth and saves screenshots
|
|
16
|
+
- Can capture the full length of the page
|
|
17
|
+
- Can use a specific resolution for screenshots
|
|
18
|
+
- Skips capturing if the screenshot was already saved recently
|
|
19
|
+
- Uses local caching to avoid expensive crawl operations if not needed
|
|
20
|
+
- Reports broken links
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
**Using Docker**
|
|
25
|
+
|
|
26
|
+
You can run Snapcrawl by using this docker image (which contains all the
|
|
27
|
+
necessary prerequisites):
|
|
28
|
+
|
|
29
|
+
```shell
|
|
30
|
+
$ alias snapcrawl='docker run --rm -it --network host --volume "$PWD:/app" dannyben/snapcrawl'
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
For more information on the Docker image, refer to the [docker-snapcrawl][3] repository.
|
|
34
|
+
|
|
35
|
+
**Using Ruby**
|
|
36
|
+
|
|
37
|
+
```shell
|
|
38
|
+
$ gem install snapcrawl
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Note that Snapcrawl requires [PhantomJS][1] and [ImageMagick][2].
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
Snapcrawl can be configured either through a configuration file (YAML), or by specifying options in the command line.
|
|
46
|
+
|
|
47
|
+
```shell
|
|
48
|
+
$ snapcrawl
|
|
49
|
+
Usage:
|
|
50
|
+
snapcrawl URL [--config FILE] [SETTINGS...]
|
|
51
|
+
snapcrawl -h | --help
|
|
52
|
+
snapcrawl -v | --version
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
The default configuration filename is `snapcrawl.yml`.
|
|
56
|
+
|
|
57
|
+
Using the `--config` flag will create a template configuration file if it is not present:
|
|
58
|
+
|
|
59
|
+
```shell
|
|
60
|
+
$ snapcrawl example.com --config snapcrawl
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Specifying options in the command line
|
|
64
|
+
|
|
65
|
+
All configuration options can be specified in the command line as `key=value` pairs:
|
|
66
|
+
|
|
67
|
+
```shell
|
|
68
|
+
$ snapcrawl example.com log_level=0 depth=2 width=1024
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Sample configuration file
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
# All values below are the default values
|
|
75
|
+
|
|
76
|
+
# log level (0-4) 0=DEBUG 1=INFO 2=WARN 3=ERROR 4=FATAL
|
|
77
|
+
log_level: 1
|
|
78
|
+
|
|
79
|
+
# log_color (yes, no, auto)
|
|
80
|
+
# yes = always show log color
|
|
81
|
+
# no = never use colors
|
|
82
|
+
# auto = only use colors when running in an interactive terminal
|
|
83
|
+
log_color: auto
|
|
84
|
+
|
|
85
|
+
# number of levels to crawl, 0 means capture only the root URL
|
|
86
|
+
depth: 1
|
|
87
|
+
|
|
88
|
+
# screenshot width in pixels
|
|
89
|
+
width: 1280
|
|
90
|
+
|
|
91
|
+
# screenshot height in pixels, 0 means the entire height
|
|
92
|
+
height: 0
|
|
93
|
+
|
|
94
|
+
# number of seconds to consider the page cache and its screenshot fresh
|
|
95
|
+
cache_life: 86400
|
|
96
|
+
|
|
97
|
+
# where to store the HTML page cache
|
|
98
|
+
cache_dir: cache
|
|
99
|
+
|
|
100
|
+
# where to store screenshots
|
|
101
|
+
snaps_dir: snaps
|
|
102
|
+
|
|
103
|
+
# screenshot filename template, where '%{url}' will be replaced with a
|
|
104
|
+
# slug version of the URL (no need to include the .png extension)
|
|
105
|
+
name_template: '%{url}'
|
|
106
|
+
|
|
107
|
+
# urls not matching this regular expression will be ignored
|
|
108
|
+
url_whitelist:
|
|
109
|
+
|
|
110
|
+
# urls matching this regular expression will be ignored
|
|
111
|
+
url_blacklist:
|
|
112
|
+
|
|
113
|
+
# take a screenshot of this CSS selector only
|
|
114
|
+
css_selector:
|
|
115
|
+
|
|
116
|
+
# when true, ignore SSL related errors
|
|
117
|
+
skip_ssl_verification: false
|
|
118
|
+
|
|
119
|
+
# set to any number of seconds to wait for the page to load before taking
|
|
120
|
+
# a screenshot, leave empty to not wait at all (only needed for pages with
|
|
121
|
+
# animations or other post-load events).
|
|
122
|
+
screenshot_delay:
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Contributing / Support
|
|
126
|
+
If you experience any issue, have a question or a suggestion, or if you wish
|
|
127
|
+
to contribute, feel free to [open an issue][issues].
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
[1]: http://phantomjs.org/download.html
|
|
132
|
+
[2]: https://imagemagick.org/script/download.php
|
|
133
|
+
[3]: https://github.com/DannyBen/docker-snapcrawl
|
|
134
|
+
[issues]: https://github.com/DannyBen/snapcrawl/issues
|
|
135
|
+
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
require 'snapcrawl/version'
|
|
2
|
+
|
|
3
|
+
title 'Snapcrawl Runfile'
summary 'Runfile tasks for building the Snapcrawl gem'
version Snapcrawl::VERSION

import_gem 'runfile-tasks/gem'
import 'debug'

help "Regenerate the command line output in the README file"
action :patchme do
  readme_text = File.read('README.md')

  # Capture the CLI help output and prefix each of its lines.
  help_text = `bundle exec snapcrawl -h`.gsub(/^/, " ")

  # Replace everything between the `$ snapcrawl --help` marker and the last
  # `---` (the middle group is greedy) with the freshly captured help text.
  # When the marker is absent the README is written back unchanged.
  readme_text.gsub!(/(\$ snapcrawl --help)(.*)(---\s*)/m) do
    "#{Regexp.last_match(1)}\n\n#{help_text}\n#{Regexp.last_match(3)}"
  end

  File.write "README.md", readme_text
end

help "Generate changelog and append old changelog"
action :changelog do
  system("git changelog --save")
  # append older changelog (prior to switching to git-changelog)
  system("cat .changelog.old.md >> CHANGELOG.md")
end

usage "mockserver"
help "Start the mock server"
action :mockserver do
  # Run rackup from inside the spec server directory.
  Dir.chdir('spec/server') { system 'rackup -p 3000 -o 0.0.0.0' }
rescue Interrupt
  abort "\rBye"
end
|
|
35
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'snapcrawl'
|
|
4
|
+
require 'colsole'
|
|
5
|
+
|
|
6
|
+
# Abort with a short farewell when the process receives an interrupt signal.
trap(:INT) { abort "\r\nGoodbye" }

include Snapcrawl
include Colsole

# Run the CLI with the process arguments. Each failure type is reported
# with its class name and a message, and mapped to its own exit status:
#   2 - phantomjs is missing
#   3 - ImageMagick (convert) is missing
#   1 - any other StandardError (backtrace printed when DEBUG is set)
begin
  CLI.new.call(ARGV)
rescue MissingPhantomJS => e
  say! "\n\nru`#{e.class}`\nCannot find phantomjs executable in the path, please install it first."
  exit 2
rescue MissingImageMagick => e
  say! "\n\nru`#{e.class}`\nCannot find convert (ImageMagick) executable in the path, please install it first."
  exit 3
rescue StandardError => e
  puts e.backtrace.reverse if ENV['DEBUG']
  say! "\nru`#{e.class}`\n#{e.message}"
  exit 1
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
require 'colsole'
|
|
2
|
+
require 'docopt'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
|
|
5
|
+
module Snapcrawl
  # Command line front end: parses arguments with docopt, applies the
  # configuration and hands the target URL to a Crawler.
  class CLI
    include Colsole
    using StringRefinements
    using PairSplit

    # Parses +args+ against the docopt usage text and executes the result.
    # When docopt raises Docopt::Exit, its message is printed instead.
    def call(args = [])
      parsed = Docopt.docopt(docopt, version: VERSION, argv: args)
      execute(parsed)
    rescue Docopt::Exit => e
      puts e.message
    end

    private

    # Runs a crawl described by the parsed docopt +args+ hash.
    def execute(args)
      # Load the config file only when one was requested.
      if (config_file = args['--config'])
        Config.load(config_file)
      end

      # SETTINGS are command line overrides, split by the PairSplit refinement.
      overrides = args['SETTINGS'].pair_split
      apply_tweaks(overrides) if overrides

      Dependencies.verify

      $logger.debug 'initializing cli'
      FileUtils.mkdir_p(Config.snaps_dir)

      Crawler.new(args['URL'].protocolize).crawl
    end

    # The docopt usage text, read once and memoized.
    def docopt
      @docopt ||= File.read(docopt_path)
    end

    # Absolute path of the docopt template that sits next to this file.
    def docopt_path
      File.expand_path('templates/docopt.txt', __dir__)
    end

    # Copies every key/value pair of +tweaks+ into the Config settings.
    # A 'log_level' entry is also applied to the global logger.
    def apply_tweaks(tweaks)
      tweaks.each do |name, setting|
        Config.settings[name] = setting
        $logger.level = setting if name == 'log_level'
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
require 'sting'
|
|
2
|
+
require 'fileutils'
|
|
3
|
+
|
|
4
|
+
module Snapcrawl
  # Application settings store, built on Sting. Defaults are pushed first
  # and an optional YAML file is pushed on top of them.
  class Config < Sting
    class << self
      # Resets the settings, pushes the defaults, and then, when +file+ is
      # given, either pushes that file or creates it from the bundled
      # template if it does not exist yet. A +file+ without a .yml/.yaml
      # extension gets ".yml" appended.
      def load(file = nil)
        reset!
        push(defaults)

        return unless file

        file = "#{file}.yml" unless /\.ya?ml$/.match?(file)

        # FIXME: Cannot use logger here due to the "chicken and egg" with
        #        Config. The $logger is available, but it was not yet fully
        #        configured with log_level etc.
        File.exist?(file) ? push(file) : create_config(file)
      end

      private

      # Default values for every supported setting.
      def defaults
        {
          depth:                 1,
          width:                 1280,
          height:                0,
          cache_life:            86_400,
          cache_dir:             'cache',
          snaps_dir:             'snaps',
          name_template:         '%{url}',
          url_whitelist:         nil,
          url_blacklist:         nil,
          css_selector:          nil,
          log_level:             1,
          log_color:             'auto',
          skip_ssl_verification: false,
          screenshot_delay:      nil,
        }
      end

      # Writes a copy of the config template to +file+, creating the parent
      # directory first when needed.
      def create_config(file)
        template = File.read(config_template)
        FileUtils.mkdir_p(File.dirname(file))
        File.write(file, template)
      end

      # Absolute path of the YAML template that sits next to this file.
      def config_template
        File.expand_path('templates/config.yml', __dir__)
      end
    end
  end
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
require 'fileutils'
|
|
2
|
+
|
|
3
|
+
module Snapcrawl
  # Crawls from a root URL, captures a screenshot for each valid page and
  # follows sub pages while the page depth is below Config.depth.
  class Crawler
    using StringRefinements

    attr_reader :url

    # @param url [String] the root URL to start crawling from
    def initialize(url)
      $logger.debug "initializing crawler with g`#{url}`"

      # Log a copy of the settings with the name template's percent sign
      # doubled (presumably so '%{url}' survives the log formatting).
      config_for_display = Config.settings.dup
      config_for_display['name_template'] = '%%{url}'

      $logger.debug "config #{config_for_display}"
      @url = url
    end

    # Verifies the external dependencies, queues the root page and keeps
    # processing the queue until it is empty.
    def crawl
      Dependencies.verify
      todo[url] = Page.new url
      process_todo while todo.any?
    end

    private

    # Takes the next page off the queue, marks its URL as done, processes it
    # and, when it was valid and is not too deep, queues its sub pages.
    def process_todo
      $logger.debug "processing queue: g`#{todo.count} remaining`"

      url, page = todo.shift
      done.push url

      return unless process_page page

      register_sub_pages page.pages if page.depth < Config.depth
    end

    # Adds each of +pages+ to the queue unless it is already queued, already
    # done, or rejected by the whitelist / blacklist patterns.
    def register_sub_pages(pages)
      pages.each do |sub_page|
        # Both `todo` (keys) and `done` hold URLs, so the lookup has to use
        # the page's URL; comparing the page object itself never matches and
        # lets already processed pages be queued again.
        page_url = sub_page.url
        next if todo.key?(page_url) || done.include?(page_url)

        if Config.url_whitelist && sub_page.path !~ (/#{Config.url_whitelist}/)
          $logger.debug "ignoring mu`#{sub_page.url}`, reason: whitelist"
          next
        end

        if Config.url_blacklist && sub_page.path =~ (/#{Config.url_blacklist}/)
          $logger.debug "ignoring mu`#{sub_page.url}`, reason: blacklist"
          next
        end

        todo[page_url] = sub_page
      end
    end

    # Captures a screenshot for +page+ unless a fresh one already exists.
    #
    # @return [Boolean] false when the page is invalid, true otherwise
    def process_page(page)
      # Fill the %{url} placeholder of the name template with the URL slug.
      outfile = "#{Config.snaps_dir}/#{Config.name_template}.png" % { url: page.url.to_slug }

      $logger.info "processing mu`#{page.url}`, depth: #{page.depth}"

      unless page.valid?
        $logger.debug "page #{page.path} is invalid, aborting process"
        return false
      end

      if file_fresh? outfile
        $logger.info "screenshot for #{page.path} already exists"
      else
        $logger.info "gb`capturing screenshot for #{page.path}`"
        save_screenshot page, outfile
      end

      true
    end

    # Saves the screenshot; any StandardError is logged instead of raised so
    # one failing page does not stop the crawl.
    def save_screenshot(page, outfile)
      page.save_screenshot outfile
    rescue => e
      $logger.error "screenshot error on mu`#{page.path}` - r`#{e.class}`: #{e.message}"
    end

    # True when caching is enabled (cache_life > 0), +file+ exists and it is
    # younger than Config.cache_life seconds.
    def file_fresh?(file)
      Config.cache_life.positive? && File.exist?(file) && file_age(file) < Config.cache_life
    end

    # Whole seconds elapsed since +file+ was last modified.
    def file_age(file)
      (Time.now - File.stat(file).mtime).to_i
    end

    # Queue of pending pages, keyed by URL.
    def todo
      @todo ||= {}
    end

    # URLs that were already taken off the queue.
    def done
      @done ||= []
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'colsole'
|
|
2
|
+
|
|
3
|
+
module Snapcrawl
  # Checks that the external executables Snapcrawl relies on are present.
  class Dependencies
    class << self
      include Colsole

      # Raises MissingPhantomJS or MissingImageMagick when the matching
      # executable is not found. A successful check is remembered, so
      # later calls return immediately.
      def verify
        return if @verified

        raise MissingPhantomJS unless available?('phantomjs', 'phantomjs')
        raise MissingImageMagick unless available?('imagemagick', 'convert')

        @verified = true
      end

      private

      # Logs the check for +label+ and reports whether +executable+ exists.
      def available?(label, executable)
        $logger.debug "verifying g`#{label}` is present"
        command_exist? executable
      end
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
require 'colsole'
|
|
2
|
+
|
|
3
|
+
module Snapcrawl
  # Helpers for building a severity-colored log formatter.
  module LogHelpers
    include Colsole

    # Color code used for each severity label.
    SEVERITY_COLORS = {
      'INFO'  => :b,
      'WARN'  => :y,
      'ERROR' => :r,
      'FATAL' => :r,
      'DEBUG' => :c,
    }

    # Returns a proc taking (severity, time, progname, message) that renders
    # "<severity> : <message>\n", with the severity right-aligned to five
    # characters. Color markup is applied or stripped per #use_colors?.
    def log_formatter
      proc do |severity, _time, _prog, message|
        color = SEVERITY_COLORS[severity]
        text = "#{color}`#{severity.rjust 5}` : #{message}\n"

        if use_colors?
          colorize(text)
        else
          strip_colors(text)
        end
      end
    end

    # Whether log output should be colored: the TTY check when
    # Config.log_color is 'auto', otherwise the configured value itself.
    # Only a truthy result is memoized.
    def use_colors?
      @use_colors ||= begin
        setting = Config.log_color
        setting == 'auto' ? tty? : setting
      end
    end

    # The TTY environment variable ('on' / 'off') overrides detection;
    # any other value falls back to asking $stdout.
    def tty?
      forced = ENV['TTY']
      return true if forced == 'on'
      return false if forced == 'off'

      $stdout.tty?
    end
  end
end
|