middleman-crawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 304f53a614e91d793893025e8c7e9e1bd97de1a6
4
+ data.tar.gz: 991befa646235f146eb996f0608786130b62095a
5
+ SHA512:
6
+ metadata.gz: 785c85c945369332430d67a6bd139103eeadcfb6d166baae3aead4b21576247ad1764a8115516d9d4f2dc35df6c78b8d44921ee14f3879f97f3461f73b18ef97
7
+ data.tar.gz: c7e800ab263c448a8526245e5c0e8c3a7c8302c4d584735c77748a99af37f147294fa7e85684307e585e671eeea4dd377c24dbf0897250442f49dfb6dfbb7d7a
data/.gitignore ADDED
@@ -0,0 +1,34 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Documentation cache and generated files:
17
+ /.yardoc/
18
+ /_yardoc/
19
+ /doc/
20
+ /rdoc/
21
+
22
+ ## Environment normalization:
23
+ /.bundle/
24
+ /vendor/bundle
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ Gemfile.lock
30
+ .ruby-version
31
+ .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
# Gems are fetched over HTTPS. If OpenSSL is not installed, change the
# source URL below to use "http://" instead.
source 'https://rubygems.org'

# Runtime dependencies are declared in the gemspec.
gemspec

# Tooling needed only while developing the gem.
group :development do
  %w[rake rdoc yard].each { |name| gem name }
end

# No test-only gems yet.
group :test do
end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Filippo Gangi Dino
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Middleman::Crawler
2
+
3
+ Add a crawler CLI to crawl your middleman site.
4
+
5
+ [![Gem Version](https://badge.fury.io/rb/middleman-crawler.png)](http://badge.fury.io/rb/middleman-crawler)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'middleman-crawler'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install middleman-crawler
20
+
21
+ ## Usage
22
+
23
+ First you need to activate the module in your `config.rb`.
24
+
25
+ ```
26
+ activate :crawler
27
+ ```
28
+
29
+ Then declare this config variable in your `environments/*.rb` files:
30
+
31
+ ```
32
+ config[:base_url] = "http://someurl.here"
33
+ ```
34
+
35
+ This extension uses the [Rawler gem](https://github.com/oscardelben/rawler) and implements the same interface, with one additional option: `environment`.
36
+
37
+ In CLI you can write `middleman crawler http://example.com [options]`
38
+
39
+ where [options] are:
40
+ --environment, -e <s>: The Middleman Environment
41
+ --username, -u <s>: HTTP Basic Username
42
+ --password, -p <s>: HTTP Basic Password
43
+ --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 0.1)
44
+ --log, -l: Log results to file rawler_log.txt
45
+ --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
46
+ --css, -c: Check CSS links
47
+ --skip, -s <s>: Skip URLs that match a regexp
48
+ --iskip, -i <s>: Skip URLs that match a case insensitive regexp
49
+ --include <s>: Only include URLs that match a regexp
50
+ --iinclude <s>: Only include URLs that match a case insensitive regexp
51
+ --local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
52
+ --ignore_fragments: Strips any fragment from parsed links
53
+
54
+
55
+ ## Contributing
56
+
57
+ 1. Fork it ( https://github.com/welaika/middleman-crawler )
58
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
59
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
60
+ 4. Push to the branch (`git push origin my-new-feature`)
61
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
# Set up the bundle, then install Bundler's gem helper Rake tasks.
require 'bundler'

Bundler.setup
Bundler::GemHelper.install_tasks
@@ -0,0 +1,4 @@
1
# The project root is three directory levels above this file.
PROJECT_ROOT_PATH = 3.times.reduce(__FILE__) { |path, _| File.dirname(path) }

# Load Middleman and its step definitions, then this gem's entry point.
require 'middleman-core'
require 'middleman-core/step_definitions'
require File.join(PROJECT_ROOT_PATH, 'lib', 'middleman-crawler')
@@ -0,0 +1,8 @@
1
+ require 'middleman-core'
2
+
3
+ require 'middleman-crawler/commands'
4
+
5
# Register the extension under the :crawler name. The block loads the
# extension file and returns the Crawler class.
::Middleman::Extensions.register(:crawler) do
  require 'middleman-crawler/extension'
  Crawler
end
@@ -0,0 +1,169 @@
1
+ require 'middleman-core/cli'
2
+ require 'middleman-core/rack' if Middleman::VERSION.to_i > 3
3
+ require 'middleman-crawler/extension'
4
+
5
+ require 'rawler'
6
+
7
+ module Middleman
8
+ module Cli
9
+ # This class provides a "crawler" command for the middleman CLI.
10
+ class CrawlerIndex < Thor::Group
11
include Thor::Actions

# Reject options that are not declared below.
check_unknown_options!

namespace :crawler

# Environment name; selects which environments/<name>.rb file is read.
class_option :environment,
  aliases: '-e',
  type: :string,
  default: ENV['MM_ENV'] || ENV['RACK_ENV'] || 'development',
  desc: 'The environment Middleman will run under'

class_option :username,
  aliases: '-u',
  type: :string,
  desc: 'HTTP Basic Username'

class_option :password,
  aliases: '-p',
  type: :string,
  desc: 'HTTP Basic Password'

# The default pattern is the two characters `#` and `$`.
class_option :skip,
  aliases: '-s',
  type: :string,
  default: "\#$",
  desc: 'Skip URLs that match a regexp'

# The description used to advertise "default: 3.0", which contradicted the
# default actually declared here.
class_option :wait,
  aliases: '-w',
  type: :numeric,
  default: 0.1,
  desc: "Seconds to wait between requests, may be fractional e.g. '1.5' (default: 0.1)"

class_option :css,
  aliases: '-c',
  type: :boolean,
  default: true,
  desc: "Check CSS links"

class_option :local,
  type: :boolean,
  default: true,
  desc: "Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'."

class_option :log,
  aliases: '-l',
  type: :boolean,
  desc: "Log results to file rawler_log.txt"

class_option :logfile,
  aliases: '-o',
  type: :string,
  desc: "Specify logfile, implies --log (default: rawler_log.txt)"

class_option :iskip,
  aliases: '-i',
  type: :string,
  desc: "Skip URLs that match a case insensitive regexp"

class_option :include,
  type: :string,
  desc: "Only include URLs that match a regexp"

class_option :iinclude,
  type: :string,
  desc: "Only include URLs that match a case insensitive regexp"

class_option :ignore_fragments,
  type: :boolean,
  desc: "Strips any fragment from parsed links"
82
+
83
+
84
class << self
  # Tells Thor to exit with a nonzero status when the command fails.
  #
  # @return [Boolean] always true
  def exit_on_failure?
    true
  end
end
88
+
89
# Crawls the site configured for the selected environment and reports every
# URL whose status code falls outside 100..399.
#
# The base URL is read from `config[:base_url]` in
# `environments/<environment>.rb` under the Middleman application root. That
# file is evaluated against a bare Configure object, so only `config` is
# available to it.
def crawler
  environment = get_configs[:environment]
  env_config = File.join(Middleman::Application.root, 'environments', "#{environment}.rb")

  unless File.exist?(env_config)
    # This used to return silently, leaving no hint why nothing was crawled.
    say("Environment file not found: #{env_config}", :red)
    return
  end

  configurations = Configure.new
  configurations.instance_eval(File.read(env_config), env_config, 1)

  raw = Rawler::Base.new(configurations.config[:base_url], $stdout, rawler_options)
  raw.validate

  errors = raw.responses.reject do |_url, result|
    (100..399).cover?(result[:status].to_i)
  end

  # The rejected collection is truthy even when empty, so test emptiness
  # explicitly; otherwise the error branch is always taken.
  if errors.empty?
    puts "All crawling actions ok"
  else
    say("There are some errors: #{errors_message_for(errors)}", :red)
  end
end
108
+
109
+ private
110
+
111
# Collects the command-line options into a symbol-keyed hash.
#
# The hash is built once and memoized, because rawler_options reads it many
# times. A nil, empty or whitespace-only environment falls back to
# "development".
#
# @return [Hash{Symbol => Object}] :environment (a Symbol) followed by the
#   raw values of every other declared option (nil when not given)
def get_configs
  @configs ||= begin
    environment = options['environment']
    environment = 'development' if environment.nil? || environment.to_s.strip.empty?

    passthrough = %i[
      username password skip wait css local log logfile
      iskip include iinclude ignore_fragments
    ]

    passthrough.each_with_object({ environment: environment.to_sym }) do |key, configs|
      configs[key] = options[key.to_s]
    end
  end
end
128
+
129
# Builds the options hash handed to Rawler::Base.new.
#
# :skip, :wait, :css and :local are always set, falling back to '#$', 0.1,
# true and true. Every other key is included only when the user supplied a
# value.
#
# @return [Hash{Symbol => Object}]
def rawler_options
  configs = get_configs
  # True for any value that is neither nil nor an empty/whitespace-only string.
  given = ->(value) { !value.nil? && !value.to_s.strip.empty? }

  opts = {
    skip: given.(configs[:skip]) ? configs[:skip].to_s : '#$',
    wait: given.(configs[:wait]) ? configs[:wait].to_f : 0.1,
    css: configs[:css].nil? ? true : configs[:css],
    local: configs[:local].nil? ? true : configs[:local]
  }

  # String options: each is gated on its OWN value. :iinclude used to be
  # gated on :iskip, so --iinclude was dropped unless --iskip was also given.
  %i[username password logfile iskip include iinclude].each do |key|
    opts[key] = configs[key].to_s if given.(configs[key])
  end

  # Boolean flags: forwarded whenever they were set, including false.
  %i[log ignore_fragments].each do |key|
    opts[key] = configs[key] unless configs[key].nil?
  end

  opts
end
145
+
146
# Formats failed responses as a single human-readable line.
#
# @param responses [Hash] URL => response hash containing a :status entry
# @return [String] "url: <url>, error_code: <status>" entries joined by "; "
#   (an empty string when there are no responses)
def errors_message_for(responses)
  lines = responses.map do |url, response|
    "url: #{url}, error_code: #{response[:status]}"
  end
  lines.join("; ")
end
154
+
155
# Minimal evaluation context for an environment file. It exposes only a
# `config` hash, so a line such as `config[:base_url] = "..."` in that file
# stores the value here.
class Configure
  # @return [Hash] settings collected so far
  attr_reader :config

  def initialize
    @config = Hash.new
  end
end
163
+
164
+ end
165
+
166
+ # Add to CLI
167
+ Base.register(Middleman::Cli::CrawlerIndex, 'crawler', 'crawler [options]', "Starts a crawler")
168
+ end
169
+ end
@@ -0,0 +1,31 @@
1
+ # Require core library
2
+ require 'middleman-core'
3
+
4
# Middleman extension class for the crawler. It currently adds no behaviour
# of its own beyond what ::Middleman::Extension provides.
class Crawler < ::Middleman::Extension
  # NOTE(review): this names a `crawler` method, but the class does not
  # define one - confirm whether a template helper was intended.
  expose_to_template :crawler

  # Delegates to the base class, which builds the options from options_hash.
  def initialize(app, options_hash = {}, &block)
    super
  end

  # Hook left intentionally empty.
  def after_configuration
  end
end
@@ -0,0 +1 @@
1
+ require 'middleman-crawler'
@@ -0,0 +1,27 @@
1
# -*- encoding: utf-8 -*-
$LOAD_PATH.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "middleman-crawler"
  spec.version     = "0.0.1"
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["weLaika", "Filippo Gangi Dino", "Fabrizio Monti"]
  spec.email       = ["info@welaika.com"]
  spec.homepage    = "https://github.com/welaika/middleman-crawler"
  spec.summary     = "crawler for Middleman site"
  spec.description = "it starts a crawler for Middleman sites"
  spec.license     = 'MIT'

  # Package contents come from the files git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # The version of middleman-core the extension depends on
  spec.add_runtime_dependency "middleman-core", ">= 4.1.9"

  # Crawling engine
  spec.add_runtime_dependency "rawler", "~> 0.1.8"

  # NOTE(review): bundler is declared as a runtime dependency pinned to
  # '~> 1.3' - confirm this is intended rather than a development dependency.
  spec.add_dependency 'bundler', '~> 1.3'
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: middleman-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - weLaika
8
+ - Filippo Gangi Dino
9
+ - Fabrizio Monti
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2016-07-05 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: middleman-core
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: 4.1.9
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: 4.1.9
29
+ - !ruby/object:Gem::Dependency
30
+ name: rawler
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - "~>"
34
+ - !ruby/object:Gem::Version
35
+ version: 0.1.8
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - "~>"
41
+ - !ruby/object:Gem::Version
42
+ version: 0.1.8
43
+ - !ruby/object:Gem::Dependency
44
+ name: bundler
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.3'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ description: it starts a crawler for Middleman sites
58
+ email:
59
+ - info@welaika.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - Gemfile
66
+ - LICENSE
67
+ - README.md
68
+ - Rakefile
69
+ - features/support/env.rb
70
+ - lib/middleman-crawler.rb
71
+ - lib/middleman-crawler/commands.rb
72
+ - lib/middleman-crawler/extension.rb
73
+ - lib/middleman-extension.rb
74
+ - middleman-crawler.gemspec
75
+ homepage: https://github.com/welaika/middleman-crawler
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.5.1
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: crawler for Middleman site
99
+ test_files:
100
+ - features/support/env.rb
101
+ has_rdoc: