middleman-crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 304f53a614e91d793893025e8c7e9e1bd97de1a6
4
+ data.tar.gz: 991befa646235f146eb996f0608786130b62095a
5
+ SHA512:
6
+ metadata.gz: 785c85c945369332430d67a6bd139103eeadcfb6d166baae3aead4b21576247ad1764a8115516d9d4f2dc35df6c78b8d44921ee14f3879f97f3461f73b18ef97
7
+ data.tar.gz: c7e800ab263c448a8526245e5c0e8c3a7c8302c4d584735c77748a99af37f147294fa7e85684307e585e671eeea4dd377c24dbf0897250442f49dfb6dfbb7d7a
data/.gitignore ADDED
@@ -0,0 +1,34 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Documentation cache and generated files:
17
+ /.yardoc/
18
+ /_yardoc/
19
+ /doc/
20
+ /rdoc/
21
+
22
+ ## Environment normalization:
23
+ /.bundle/
24
+ /vendor/bundle
25
+ /lib/bundler/man/
26
+
27
+ # for a library or gem, you might want to ignore these files since the code is
28
+ # intended to run in multiple environments; otherwise, check them in:
29
+ Gemfile.lock
30
+ .ruby-version
31
+ .ruby-gemset
32
+
33
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
+ .rvmrc
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ # If you do not have OpenSSL installed, update
2
+ # the following line to use "http://" instead
3
+ source 'https://rubygems.org'
4
+
5
+ gemspec
6
+
7
+ group :development do
8
+ gem 'rake'
9
+ gem 'rdoc'
10
+ gem 'yard'
11
+ end
12
+
13
+ group :test do
14
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Filippo Gangi Dino
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # Middleman::Crawler
2
+
3
+ Add a crawler CLI to crawl your middleman site.
4
+
5
+ [![Gem Version](https://badge.fury.io/rb/middleman-crawler.png)](http://badge.fury.io/rb/middleman-crawler)
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'middleman-crawler'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install middleman-crawler
20
+
21
+ ## Usage
22
+
23
+ First you need to activate the module in your `config.rb`.
24
+
25
+ ```
26
+ activate :crawler
27
+ ```
28
+
29
+ Then declare this config variable in your `environments/*.rb`:
30
+
31
+ ```
32
+ config[:base_url] = "http://someurl.here"
33
+ ```
34
+
35
+ This extension uses the [Rawler gem](https://github.com/oscardelben/rawler) and implements the same interface, with one additional option: `environment`.
36
+
37
+ In CLI you can write `middleman crawler http://example.com [options]`
38
+
39
+ where [options] are:
40
+ --environment, -e <s>: The Middleman Environment
41
+ --username, -u <s>: HTTP Basic Username
42
+ --password, -p <s>: HTTP Basic Password
43
+ --wait, -w <f>: Seconds to wait between requests, may be fractional e.g. '1.5' (default: 0.1)
44
+ --log, -l: Log results to file rawler_log.txt
45
+ --logfile, -o <s>: Specify logfile, implies --log (default: rawler_log.txt)
46
+ --css, -c: Check CSS links
47
+ --skip, -s <s>: Skip URLs that match a regexp
48
+ --iskip, -i <s>: Skip URLs that match a case insensitive regexp
49
+ --include <s>: Only include URLs that match a regexp
50
+ --iinclude <s>: Only include URLs that match a case insensitive regexp
51
+ --local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
52
+ --ignore_fragments: Strips any fragment from parsed links
53
+
54
+
55
+ ## Contributing
56
+
57
+ 1. Fork it ( https://github.com/welaika/middleman-crawler )
58
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
59
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
60
+ 4. Push to the branch (`git push origin my-new-feature`)
61
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require 'bundler'
2
+ Bundler.setup
3
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1,4 @@
1
+ PROJECT_ROOT_PATH = File.dirname(File.dirname(File.dirname(__FILE__)))
2
+ require 'middleman-core'
3
+ require 'middleman-core/step_definitions'
4
+ require File.join(PROJECT_ROOT_PATH, 'lib', 'middleman-crawler')
@@ -0,0 +1,8 @@
1
+ require 'middleman-core'
2
+
3
+ require 'middleman-crawler/commands'
4
+
5
+ Middleman::Extensions.register(:crawler) do
6
+ require 'middleman-crawler/extension'
7
+ Crawler
8
+ end
@@ -0,0 +1,169 @@
1
+ require 'middleman-core/cli'
2
+ require 'middleman-core/rack' if Middleman::VERSION.to_i > 3
3
+ require 'middleman-crawler/extension'
4
+
5
+ require 'rawler'
6
+
7
+ module Middleman
8
+ module Cli
9
+ # This class provides a "crawler" command for the middleman CLI.
10
+ class CrawlerIndex < Thor::Group
11
+ include Thor::Actions
12
+
13
+ check_unknown_options!
14
+
15
+ namespace :crawler
16
+
17
+ class_option :environment,
18
+ aliases: '-e',
19
+ type: :string,
20
+ default: ENV['MM_ENV'] || ENV['RACK_ENV'] || 'development',
21
+ desc: 'The environment Middleman will run under'
22
+
23
+ class_option :username,
24
+ aliases: '-u',
25
+ type: :string,
26
+ desc: 'HTTP Basic Username'
27
+
28
+ class_option :password,
29
+ aliases: '-p',
30
+ type: :string,
31
+ desc: 'HTTP Basic Password'
32
+
33
+ class_option :skip,
34
+ aliases: '-s',
35
+ type: :string,
36
+ default: "\#$",
37
+ desc: 'Skip URLs that match a regexp'
38
+
39
+ class_option :wait,
40
+ aliases: '-w',
41
+ type: :numeric,
42
+ default: 0.1,
43
+ desc: "Seconds to wait between requests, may be fractional e.g. '1.5' (default: 3.0)"
44
+
45
+ class_option :css,
46
+ aliases: '-c',
47
+ type: :boolean,
48
+ default: true,
49
+ desc: "Check CSS links"
50
+
51
+ class_option :local,
52
+ type: :boolean,
53
+ default: true,
54
+ desc: "Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'."
55
+
56
+ class_option :log,
57
+ aliases: '-l',
58
+ type: :boolean,
59
+ desc: "Log results to file rawler_log.txt"
60
+
61
+ class_option :logfile,
62
+ aliases: '-o',
63
+ type: :string,
64
+ desc: "Specify logfile, implies --log (default: rawler_log.txt)"
65
+
66
+ class_option :iskip,
67
+ aliases: '-i',
68
+ type: :string,
69
+ desc: "Skip URLs that match a case insensitive regexp"
70
+
71
+ class_option :include,
72
+ type: :string,
73
+ desc: "Only include URLs that match a regexp"
74
+
75
+ class_option :iinclude,
76
+ type: :string,
77
+ desc: "Only include URLs that match a case insensitive regexp"
78
+
79
+ class_option :ignore_fragments,
80
+ type: :boolean,
81
+ desc: "Strips any fragment from parsed links"
82
+
83
+
84
+ # Tell Thor to exit with a nonzero exit code on failure
85
+ def self.exit_on_failure?
86
+ true
87
+ end
88
+
89
+ def crawler
90
+ configs = get_configs
91
+
92
+ env_config = File.join(Middleman::Application.root, 'environments', "#{configs[:environment]}.rb")
93
+ return unless File.exist? env_config
94
+
95
+ configurations = Configure.new
96
+
97
+ configurations.instance_eval File.read(env_config), env_config, 1
98
+
99
+ response = {}
100
+ raw = Rawler::Base.new(configurations.config[:base_url], $stdout, rawler_options)
101
+ raw.validate
102
+ response['errors'] = raw.responses.reject{ |link, response|
103
+ (100..399).include?(response[:status].to_i)
104
+ }
105
+ say("There are some errors: #{errors_message_for(response['errors'])}", :red) if response['errors']
106
+ puts "All crawling actions ok" unless response['errors']
107
+ end
108
+
109
+ private
110
+
111
+ def get_configs
112
+ configs ||= {
113
+ environment: (options['environment'].presence || "development").to_sym,
114
+ username: options['username'],
115
+ password: options['password'],
116
+ skip: options['skip'],
117
+ wait: options['wait'],
118
+ css: options['css'],
119
+ local: options['local'],
120
+ log: options['log'],
121
+ logfile: options['logfile'],
122
+ iskip: options['iskip'],
123
+ include: options['include'],
124
+ iinclude: options['iinclude'],
125
+ ignore_fragments: options['ignore_fragments']
126
+ }
127
+ end
128
+
129
+ def rawler_options
130
+ opts = {}
131
+ opts[:username] = get_configs[:username].to_s if get_configs[:username].present?
132
+ opts[:password] = get_configs[:password].to_s if get_configs[:password].present?
133
+ opts[:skip] = get_configs[:skip].present? ? get_configs[:skip].to_s : "\#$"
134
+ opts[:wait] = get_configs[:wait].present? ? get_configs[:wait].to_f : 0.1
135
+ opts[:css] = get_configs[:css] != nil ? get_configs[:css] : true
136
+ opts[:local] = get_configs[:local] != nil ? get_configs[:local] : true
137
+ opts[:log] = get_configs[:log] if get_configs[:log] != nil
138
+ opts[:logfile] = get_configs[:logfile].to_s if get_configs[:logfile].present?
139
+ opts[:iskip] = get_configs[:iskip].to_s if get_configs[:iskip].present?
140
+ opts[:include] = get_configs[:include].to_s if get_configs[:include].present?
141
+ opts[:iinclude] = get_configs[:iinclude].to_s if get_configs[:iskip].present?
142
+ opts[:ignore_fragments] = get_configs[:ignore_fragments] if get_configs[:ignore_fragments] != nil
143
+ opts
144
+ end
145
+
146
+ def errors_message_for(responses)
147
+ collection = responses.each_with_object([]) do |(url, response), a|
148
+ a << "url: #{url}, error_code: #{response[:status]}"
149
+ end
150
+
151
+ collection.join("; ")
152
+
153
+ end
154
+
155
+ class Configure
156
+ # Evaluation target for environments/<env>.rb (see the instance_eval in #crawler): offers only a `config` hash to assign into.
157
+ attr_reader :config
158
+
159
+ def initialize
160
+ @config = {}
161
+ end
162
+ end
163
+
164
+ end
165
+
166
+ # Add to CLI
167
+ Base.register(Middleman::Cli::CrawlerIndex, 'crawler', 'crawler [options]', "Starts a crawler")
168
+ end
169
+ end
@@ -0,0 +1,31 @@
1
+ # Require core library
2
+ require 'middleman-core'
3
+
4
+ # Extension namespace
5
+ class Crawler < ::Middleman::Extension
6
+ # NOTE(review): `expose_to_template :crawler` names a method this class never defines — confirm it is needed.
7
+ expose_to_template :crawler
8
+
9
+ def initialize(app, options_hash={}, &block)
10
+ # Call super to build options from the options_hash
11
+ super
12
+
13
+ # Require libraries only when activated
14
+ # require 'necessary/library'
15
+
16
+ # No further setup is performed here.
17
+ end
18
+
19
+ def after_configuration
20
+ # Currently a no-op.
21
+ end
22
+
23
+ # A Sitemap Manipulator
24
+ # def manipulate_resource_list(resources)
25
+ # end
26
+
27
+ # helpers do
28
+ # def a_helper
29
+ # end
30
+ # end
31
+ end
@@ -0,0 +1 @@
1
+ require 'middleman-crawler'
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = "middleman-crawler"
6
+ s.version = "0.0.1"
7
+ s.platform = Gem::Platform::RUBY
8
+ s.authors = ["weLaika", "Filippo Gangi Dino", "Fabrizio Monti"]
9
+ s.email = ["info@welaika.com"]
10
+ s.homepage = "https://github.com/welaika/middleman-crawler"
11
+ s.summary = %q{crawler for Middleman site}
12
+ s.description = %q{it starts a crawler for Middleman sites}
13
+
14
+ s.files = `git ls-files`.split("\n")
15
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
16
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
17
+ s.require_paths = ["lib"]
18
+ s.license = 'MIT'
19
+
20
+ # The version of middleman-core your extension depends on
21
+ s.add_runtime_dependency "middleman-core", [">= 4.1.9"]
22
+
23
+ # Additional dependencies
24
+ s.add_runtime_dependency "rawler", "~> 0.1.8"
25
+
26
+ s.add_dependency 'bundler', '~> 1.3'
27
+ end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: middleman-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - weLaika
8
+ - Filippo Gangi Dino
9
+ - Fabrizio Monti
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2016-07-05 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: middleman-core
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: 4.1.9
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: 4.1.9
29
+ - !ruby/object:Gem::Dependency
30
+ name: rawler
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - "~>"
34
+ - !ruby/object:Gem::Version
35
+ version: 0.1.8
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - "~>"
41
+ - !ruby/object:Gem::Version
42
+ version: 0.1.8
43
+ - !ruby/object:Gem::Dependency
44
+ name: bundler
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.3'
50
+ type: :runtime
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - "~>"
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ description: it starts a crawler for Middleman sites
58
+ email:
59
+ - info@welaika.com
60
+ executables: []
61
+ extensions: []
62
+ extra_rdoc_files: []
63
+ files:
64
+ - ".gitignore"
65
+ - Gemfile
66
+ - LICENSE
67
+ - README.md
68
+ - Rakefile
69
+ - features/support/env.rb
70
+ - lib/middleman-crawler.rb
71
+ - lib/middleman-crawler/commands.rb
72
+ - lib/middleman-crawler/extension.rb
73
+ - lib/middleman-extension.rb
74
+ - middleman-crawler.gemspec
75
+ homepage: https://github.com/welaika/middleman-crawler
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.5.1
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: crawler for Middleman site
99
+ test_files:
100
+ - features/support/env.rb
101
+ has_rdoc: