middleman-crawler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +34 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +61 -0
- data/Rakefile +3 -0
- data/features/support/env.rb +4 -0
- data/lib/middleman-crawler.rb +8 -0
- data/lib/middleman-crawler/commands.rb +169 -0
- data/lib/middleman-crawler/extension.rb +31 -0
- data/lib/middleman-extension.rb +1 -0
- data/middleman-crawler.gemspec +27 -0
- metadata +101 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 304f53a614e91d793893025e8c7e9e1bd97de1a6
|
4
|
+
data.tar.gz: 991befa646235f146eb996f0608786130b62095a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 785c85c945369332430d67a6bd139103eeadcfb6d166baae3aead4b21576247ad1764a8115516d9d4f2dc35df6c78b8d44921ee14f3879f97f3461f73b18ef97
|
7
|
+
data.tar.gz: c7e800ab263c448a8526245e5c0e8c3a7c8302c4d584735c77748a99af37f147294fa7e85684307e585e671eeea4dd377c24dbf0897250442f49dfb6dfbb7d7a
|
data/.gitignore
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
## Documentation cache and generated files:
|
17
|
+
/.yardoc/
|
18
|
+
/_yardoc/
|
19
|
+
/doc/
|
20
|
+
/rdoc/
|
21
|
+
|
22
|
+
## Environment normalization:
|
23
|
+
/.bundle/
|
24
|
+
/vendor/bundle
|
25
|
+
/lib/bundler/man/
|
26
|
+
|
27
|
+
# for a library or gem, you might want to ignore these files since the code is
|
28
|
+
# intended to run in multiple environments; otherwise, check them in:
|
29
|
+
Gemfile.lock
|
30
|
+
.ruby-version
|
31
|
+
.ruby-gemset
|
32
|
+
|
33
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
34
|
+
.rvmrc
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2016 Filippo Gangi Dino
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
# Middleman::Crawler
|
2
|
+
|
3
|
+
Add a crawler CLI to crawl your middleman site.
|
4
|
+
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/middleman-crawler.png)](http://badge.fury.io/rb/middleman-crawler)
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'middleman-crawler'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install middleman-crawler
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
First you need to activate the module in your `config.rb`.
|
24
|
+
|
25
|
+
```
|
26
|
+
activate :crawler
|
27
|
+
```
|
28
|
+
|
29
|
+
Then declare this config variable in your `environments/*.rb`:
|
30
|
+
|
31
|
+
```
|
32
|
+
config[:base_url] = "http://someurl.here"
|
33
|
+
```
|
34
|
+
|
35
|
+
This extension uses the [Rawler gem](https://github.com/oscardelben/rawler) and implements the same interface, with one additional option: `environment`.
|
36
|
+
|
37
|
+
In CLI you can write `middleman crawler http://example.com [options]`
|
38
|
+
|
39
|
+
where [options] are:
|
40
|
+
--environment, -e <s>: The Middleman Environment
|
41
|
+
--username, -u <s>: HTTP Basic Username
|
42
|
+
--password, -p <s>: HTTP Basic Password
|
43
|
+
--wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 0.1)
|
44
|
+
--log, -l: Log results to file rawler_log.txt
|
45
|
+
--logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
|
46
|
+
--css, -c: Check CSS links
|
47
|
+
--skip, -s <s>: Skip URLs that match a regexp
|
48
|
+
--iskip, -i <s>: Skip URLs that match a case insensitive regexp
|
49
|
+
--include <s>: Only include URLs that match a regexp
|
50
|
+
--iinclude <s>: Only include URLs that match a case insensitive regexp
|
51
|
+
--local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
|
52
|
+
--ignore_fragments: Strips any fragment from parsed links
|
53
|
+
|
54
|
+
|
55
|
+
## Contributing
|
56
|
+
|
57
|
+
1. Fork it ( https://github.com/welaika/middleman-crawler )
|
58
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
59
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
60
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
61
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'middleman-core/cli'
|
2
|
+
require 'middleman-core/rack' if Middleman::VERSION.to_i > 3
|
3
|
+
require 'middleman-crawler/extension'
|
4
|
+
|
5
|
+
require 'rawler'
|
6
|
+
|
7
|
+
module Middleman
|
8
|
+
module Cli
|
9
|
+
# This class provides a "crawler" command for the middleman CLI.
|
10
|
+
class CrawlerIndex < Thor::Group
|
11
|
+
include Thor::Actions
|
12
|
+
|
13
|
+
check_unknown_options!
|
14
|
+
|
15
|
+
namespace :crawler
|
16
|
+
|
17
|
+
class_option :environment,
|
18
|
+
aliases: '-e',
|
19
|
+
type: :string,
|
20
|
+
default: ENV['MM_ENV'] || ENV['RACK_ENV'] || 'development',
|
21
|
+
desc: 'The environment Middleman will run under'
|
22
|
+
|
23
|
+
class_option :username,
|
24
|
+
aliases: '-u',
|
25
|
+
type: :string,
|
26
|
+
desc: 'HTTP Basic Username'
|
27
|
+
|
28
|
+
class_option :password,
|
29
|
+
aliases: '-p',
|
30
|
+
type: :string,
|
31
|
+
desc: 'HTTP Basic Password'
|
32
|
+
|
33
|
+
class_option :skip,
|
34
|
+
aliases: '-s',
|
35
|
+
type: :string,
|
36
|
+
default: "\#$",
|
37
|
+
desc: 'Skip URLs that match a regexp'
|
38
|
+
|
39
|
+
# Delay between two consecutive requests, in (possibly fractional) seconds.
# The default quoted in the help text has to agree with the `default:` value
# declared here (0.1).
class_option :wait,
             aliases: '-w',
             type: :numeric,
             default: 0.1,
             desc: "Seconds to wait between requests, may be fractional e.g. '1.5' (default: 0.1)"
|
44
|
+
|
45
|
+
class_option :css,
|
46
|
+
aliases: '-c',
|
47
|
+
type: :boolean,
|
48
|
+
default: true,
|
49
|
+
desc: "Check CSS links"
|
50
|
+
|
51
|
+
class_option :local,
|
52
|
+
type: :boolean,
|
53
|
+
default: true,
|
54
|
+
desc: "Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'."
|
55
|
+
|
56
|
+
class_option :log,
|
57
|
+
aliases: '-l',
|
58
|
+
type: :boolean,
|
59
|
+
desc: "Log results to file rawler_log.txt"
|
60
|
+
|
61
|
+
class_option :logfile,
|
62
|
+
aliases: '-o',
|
63
|
+
type: :string,
|
64
|
+
desc: "Specify logfile, implies --log (default: rawler_log.txt)"
|
65
|
+
|
66
|
+
class_option :iskip,
|
67
|
+
aliases: '-i',
|
68
|
+
type: :string,
|
69
|
+
desc: "Skip URLs that match a case insensitive regexp"
|
70
|
+
|
71
|
+
class_option :include,
|
72
|
+
type: :string,
|
73
|
+
desc: "Only include URLs that match a regexp"
|
74
|
+
|
75
|
+
class_option :iinclude,
|
76
|
+
type: :string,
|
77
|
+
desc: "Only include URLs that match a case insensitive regexp"
|
78
|
+
|
79
|
+
class_option :ignore_fragments,
|
80
|
+
type: :boolean,
|
81
|
+
desc: "Strips any fragment from parsed links"
|
82
|
+
|
83
|
+
|
84
|
+
# Tell Thor to exit with a nonzero exit code on failure
|
85
|
+
def self.exit_on_failure?
|
86
|
+
true
|
87
|
+
end
|
88
|
+
|
89
|
+
# Thor task: crawls the site configured for the selected environment and
# reports every link whose HTTP status falls outside 100..399.
#
# The base URL is read from `config[:base_url]` as set by
# `environments/<environment>.rb` under the Middleman root. The task returns
# without doing anything when that file does not exist.
def crawler
  environment = get_configs[:environment]
  env_config = File.join(Middleman::Application.root, 'environments', "#{environment}.rb")
  return unless File.exist?(env_config)

  # Evaluate the environment file against a bare Configure object so that
  # its `config[:base_url] = ...` assignment lands in a plain hash.
  configurations = Configure.new
  configurations.instance_eval File.read(env_config), env_config, 1

  raw = Rawler::Base.new(configurations.config[:base_url], $stdout, rawler_options)
  raw.validate

  errors = raw.responses.reject do |_link, result|
    (100..399).include?(result[:status].to_i)
  end

  # `reject` always returns a collection, which is truthy even when empty,
  # so the outcome must be decided with `empty?` rather than by truthiness.
  if errors.empty?
    puts "All crawling actions ok"
  else
    say("There are some errors: #{errors_message_for(errors)}", :red)
  end
end
|
108
|
+
|
109
|
+
private
|
110
|
+
|
111
|
+
# Collects the CLI options understood by this command into a symbol-keyed
# hash. The environment falls back to "development" when the option is
# blank and is converted to a symbol; every other value is passed through
# as given in `options`.
#
# The hash is cached in @configs so repeated calls do not rebuild it. A
# local variable on the left of `||=` would be discarded when the method
# returns and would cache nothing.
def get_configs
  @configs ||= {
    environment: (options['environment'].presence || "development").to_sym,
    username: options['username'],
    password: options['password'],
    skip: options['skip'],
    wait: options['wait'],
    css: options['css'],
    local: options['local'],
    log: options['log'],
    logfile: options['logfile'],
    iskip: options['iskip'],
    include: options['include'],
    iinclude: options['iinclude'],
    ignore_fragments: options['ignore_fragments']
  }
end
|
128
|
+
|
129
|
+
# Translates the collected CLI configuration into the options hash passed
# to Rawler::Base.new.
#
# String options are copied only when present. :skip, :wait, :css and
# :local always get a value, falling back to "\#$", 0.1, true and true.
# :log and :ignore_fragments are copied whenever they are not nil, so an
# explicit `false` is preserved.
def rawler_options
  configs = get_configs
  opts = {}

  opts[:username] = configs[:username].to_s if configs[:username].present?
  opts[:password] = configs[:password].to_s if configs[:password].present?
  opts[:skip] = configs[:skip].present? ? configs[:skip].to_s : "\#$"
  opts[:wait] = configs[:wait].present? ? configs[:wait].to_f : 0.1
  opts[:css] = configs[:css].nil? ? true : configs[:css]
  opts[:local] = configs[:local].nil? ? true : configs[:local]
  opts[:log] = configs[:log] unless configs[:log].nil?
  opts[:logfile] = configs[:logfile].to_s if configs[:logfile].present?
  opts[:iskip] = configs[:iskip].to_s if configs[:iskip].present?
  opts[:include] = configs[:include].to_s if configs[:include].present?
  # Guarded by its own value: checking :iskip here would drop --iinclude
  # unless --iskip was also supplied.
  opts[:iinclude] = configs[:iinclude].to_s if configs[:iinclude].present?
  opts[:ignore_fragments] = configs[:ignore_fragments] unless configs[:ignore_fragments].nil?

  opts
end
|
145
|
+
|
146
|
+
# Builds the human-readable summary of failed links.
#
# responses - enumerable of (url, response) pairs, where each response
#             answers `[:status]`.
#
# Returns one "url: ..., error_code: ..." entry per pair, joined with "; "
# (an empty string when there are no pairs).
def errors_message_for(responses)
  responses.map { |url, result| "url: #{url}, error_code: #{result[:status]}" }.join("; ")
end
|
154
|
+
|
155
|
+
class Configure
|
156
|
+
|
157
|
+
attr_reader :config
|
158
|
+
|
159
|
+
def initialize
|
160
|
+
@config = {}
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
end
|
165
|
+
|
166
|
+
# Add to CLI
|
167
|
+
Base.register(Middleman::Cli::CrawlerIndex, 'crawler', 'crawler [options]', "Starts a crawler")
|
168
|
+
end
|
169
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Require core library
|
2
|
+
require 'middleman-core'
|
3
|
+
|
4
|
+
# Extension namespace
|
5
|
+
class Crawler < ::Middleman::Extension
|
6
|
+
|
7
|
+
expose_to_template :crawler
|
8
|
+
|
9
|
+
def initialize(app, options_hash={}, &block)
|
10
|
+
# Call super to build options from the options_hash
|
11
|
+
super
|
12
|
+
|
13
|
+
# Require libraries only when activated
|
14
|
+
# require 'necessary/library'
|
15
|
+
|
16
|
+
# set up your extension
|
17
|
+
end
|
18
|
+
|
19
|
+
def after_configuration
|
20
|
+
# Do something
|
21
|
+
end
|
22
|
+
|
23
|
+
# A Sitemap Manipulator
|
24
|
+
# def manipulate_resource_list(resources)
|
25
|
+
# end
|
26
|
+
|
27
|
+
# helpers do
|
28
|
+
# def a_helper
|
29
|
+
# end
|
30
|
+
# end
|
31
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'middleman-crawler'
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = "middleman-crawler"
|
6
|
+
s.version = "0.0.1"
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ["weLaika", "Filippo Gangi Dino", "Fabrizio Monti"]
|
9
|
+
s.email = ["info@welaika.com"]
|
10
|
+
s.homepage = "https://github.com/welaika/middleman-crawler"
|
11
|
+
s.summary = %q{crawler for Middleman site}
|
12
|
+
s.description = %q{it starts a crawler for Middleman sites}
|
13
|
+
|
14
|
+
s.files = `git ls-files`.split("\n")
|
15
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
16
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.license = 'MIT'
|
19
|
+
|
20
|
+
# The version of middleman-core your extension depends on
|
21
|
+
s.add_runtime_dependency "middleman-core", [">= 4.1.9"]
|
22
|
+
|
23
|
+
# Additional dependencies
|
24
|
+
s.add_runtime_dependency "rawler", "~> 0.1.8"
|
25
|
+
|
26
|
+
s.add_dependency 'bundler', '~> 1.3'
|
27
|
+
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: middleman-crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- weLaika
|
8
|
+
- Filippo Gangi Dino
|
9
|
+
- Fabrizio Monti
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2016-07-05 00:00:00.000000000 Z
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: middleman-core
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
18
|
+
requirements:
|
19
|
+
- - ">="
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 4.1.9
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: 4.1.9
|
29
|
+
- !ruby/object:Gem::Dependency
|
30
|
+
name: rawler
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - "~>"
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 0.1.8
|
36
|
+
type: :runtime
|
37
|
+
prerelease: false
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - "~>"
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 0.1.8
|
43
|
+
- !ruby/object:Gem::Dependency
|
44
|
+
name: bundler
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '1.3'
|
50
|
+
type: :runtime
|
51
|
+
prerelease: false
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - "~>"
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '1.3'
|
57
|
+
description: it starts a crawler for Middleman sites
|
58
|
+
email:
|
59
|
+
- info@welaika.com
|
60
|
+
executables: []
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files: []
|
63
|
+
files:
|
64
|
+
- ".gitignore"
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE
|
67
|
+
- README.md
|
68
|
+
- Rakefile
|
69
|
+
- features/support/env.rb
|
70
|
+
- lib/middleman-crawler.rb
|
71
|
+
- lib/middleman-crawler/commands.rb
|
72
|
+
- lib/middleman-crawler/extension.rb
|
73
|
+
- lib/middleman-extension.rb
|
74
|
+
- middleman-crawler.gemspec
|
75
|
+
homepage: https://github.com/welaika/middleman-crawler
|
76
|
+
licenses:
|
77
|
+
- MIT
|
78
|
+
metadata: {}
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 2.5.1
|
96
|
+
signing_key:
|
97
|
+
specification_version: 4
|
98
|
+
summary: crawler for Middleman site
|
99
|
+
test_files:
|
100
|
+
- features/support/env.rb
|
101
|
+
has_rdoc:
|