juknife 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.codeclimate.yml +20 -0
- data/.gitignore +3 -0
- data/README.md +3 -3
- data/circle.yml +3 -0
- data/juknife.gemspec +4 -1
- data/lib/juknife.rb +8 -3
- data/lib/juknife/agent.rb +44 -0
- data/lib/juknife/error.rb +12 -0
- data/lib/juknife/middleware.rb +11 -0
- data/lib/juknife/middleware/html_parser.rb +16 -0
- data/lib/juknife/middleware/http_error_handler.rb +20 -0
- data/lib/juknife/middleware/request_dsl.rb +37 -0
- data/lib/juknife/middleware/scraping_dsl.rb +29 -0
- data/lib/juknife/request.rb +4 -2
- data/lib/juknife/scraping.rb +5 -3
- data/lib/juknife/version.rb +1 -1
- metadata +55 -6
- data/.travis.yml +0 -8
- data/lib/juknife/scraper.rb +0 -28
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ad09b3333ab9843a6d35109af520c0ae6d15bd45
|
4
|
+
data.tar.gz: 54a7bdd6f233f1832d9749cccad7067bfa1189a8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 857e0b2836c4ae7299ddd8c7b1f86d430bf4c3df34dae6d92a32927cdc315e35d877e86e6349979beb36a2d1b116d0b447639fe29d08360e28ca687bcf9fc3c6
|
7
|
+
data.tar.gz: f6a53f1e907d93b53cd074a6d46855b4aede515385fa96ec5588bab82153373cf6969fce8850cbb0d6147d420c71d58e0eb972684d73c93f234cd50e12ee77eb
|
data/.codeclimate.yml
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
engines:
|
2
|
+
rubocop:
|
3
|
+
enabled: true
|
4
|
+
channel: rubocop-0-48
|
5
|
+
duplication:
|
6
|
+
enabled: true
|
7
|
+
config:
|
8
|
+
languages:
|
9
|
+
- ruby
|
10
|
+
fixme:
|
11
|
+
enabled: true
|
12
|
+
|
13
|
+
ratings:
|
14
|
+
paths:
|
15
|
+
- Gemfile.lock
|
16
|
+
- "**.rb"
|
17
|
+
|
18
|
+
exclude_paths:
|
19
|
+
- spec/
|
20
|
+
- vendor/
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@ A Ruby library for Web scraping. It allows to describe scraping nodes and text s
|
|
7
7
|
For example, scraping on Google search can be written as follows:
|
8
8
|
|
9
9
|
```ruby
|
10
|
-
class
|
10
|
+
class GoogleSearchAgent < Juknife::Agent
|
11
11
|
request do
|
12
12
|
get 'https://www.google.co.jp/search'
|
13
13
|
user_agent 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
|
@@ -27,8 +27,8 @@ class GoogleSearchScraper < Juknife::Scraper
|
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
|
31
|
-
|
30
|
+
agent = GoogleSearchAgent.new
|
31
|
+
agent.scrape(query: 'test')
|
32
32
|
# =>
|
33
33
|
# {:results=>
|
34
34
|
# [
|
data/circle.yml
ADDED
data/juknife.gemspec
CHANGED
@@ -29,10 +29,13 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_runtime_dependency 'activesupport', '~> 5.0.2'
|
30
30
|
spec.add_runtime_dependency 'nokogiri', '~> 1.7'
|
31
31
|
spec.add_runtime_dependency 'faraday', '~> 0.1'
|
32
|
+
spec.add_runtime_dependency 'faraday_middleware', '~> 0.11'
|
32
33
|
spec.add_development_dependency 'bundler', '~> 1.14'
|
34
|
+
spec.add_development_dependency 'codeclimate-test-reporter'
|
35
|
+
spec.add_development_dependency 'pry'
|
33
36
|
spec.add_development_dependency 'rake', '~> 10.0'
|
34
37
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
35
38
|
spec.add_development_dependency 'rubocop'
|
39
|
+
spec.add_development_dependency 'simplecov'
|
36
40
|
spec.add_development_dependency 'yard'
|
37
|
-
spec.add_development_dependency 'pry'
|
38
41
|
end
|
data/lib/juknife.rb
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'juknife/version'
|
4
|
+
require 'active_support/dependencies/autoload'
|
4
5
|
|
5
6
|
# The module that contains everything Juknife-related.
|
6
7
|
module Juknife
|
7
|
-
|
8
|
-
|
9
|
-
autoload :
|
8
|
+
extend ActiveSupport::Autoload
|
9
|
+
|
10
|
+
autoload :Agent
|
11
|
+
autoload :Error
|
12
|
+
autoload :Middleware
|
13
|
+
autoload :Request
|
14
|
+
autoload :Scraping
|
10
15
|
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday_middleware'
|
4
|
+
|
5
|
+
module Juknife
|
6
|
+
# The agent that executes the request and scrapes the resulting web page.
|
7
|
+
class Agent
|
8
|
+
def scrape(params = {})
|
9
|
+
response(params).body
|
10
|
+
end
|
11
|
+
|
12
|
+
def response(params)
|
13
|
+
connection(params).get
|
14
|
+
end
|
15
|
+
|
16
|
+
def connection(params)
|
17
|
+
@connection ||= Faraday.new do |b|
|
18
|
+
# request middlewares
|
19
|
+
b.use Middleware::RequestDSL, params, &self.class.request_dsl
|
20
|
+
b.use Faraday::Request::UrlEncoded
|
21
|
+
|
22
|
+
# response middlewares
|
23
|
+
b.use Middleware::ScrapingDSL, params, &self.class.scraping_dsl
|
24
|
+
b.use Middleware::HTMLParser
|
25
|
+
b.use Middleware::HTTPErrorHandler
|
26
|
+
b.use FaradayMiddleware::FollowRedirects
|
27
|
+
|
28
|
+
b.adapter :net_http
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class << self
|
33
|
+
attr_reader :request_dsl, :scraping_dsl
|
34
|
+
|
35
|
+
def request(&block)
|
36
|
+
@request_dsl = block
|
37
|
+
end
|
38
|
+
|
39
|
+
def scraping(&block)
|
40
|
+
@scraping_dsl = block
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Juknife
|
4
|
+
# :nodoc:
|
5
|
+
module Middleware
|
6
|
+
autoload :HTMLParser, 'juknife/middleware/html_parser'
|
7
|
+
autoload :HTTPErrorHandler, 'juknife/middleware/http_error_handler'
|
8
|
+
autoload :RequestDSL, 'juknife/middleware/request_dsl'
|
9
|
+
autoload :ScrapingDSL, 'juknife/middleware/scraping_dsl'
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
module Juknife
|
7
|
+
# :nodoc:
|
8
|
+
module Middleware
|
9
|
+
# A Faraday middleware that parses the response body string into a Nokogiri document.
|
10
|
+
class HTMLParser < Faraday::Response::Middleware
|
11
|
+
def parse(body)
|
12
|
+
Nokogiri.parse(body)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
|
5
|
+
module Juknife
|
6
|
+
# :nodoc:
|
7
|
+
module Middleware
|
8
|
+
# A middleware to handle HTTP errors
|
9
|
+
class HTTPErrorHandler < Faraday::Response::Middleware
|
10
|
+
def on_complete(env)
|
11
|
+
case env[:status]
|
12
|
+
when 400..499
|
13
|
+
raise HTTPClientError, env[:status]
|
14
|
+
when 500..599
|
15
|
+
raise HTTPServerError, env[:status]
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
|
5
|
+
module Juknife
|
6
|
+
# :nodoc:
|
7
|
+
module Middleware
|
8
|
+
# A Faraday middleware to interpret Juknife request DSL
|
9
|
+
class RequestDSL < Faraday::Middleware
|
10
|
+
include Juknife::Request::DSL
|
11
|
+
|
12
|
+
attr_reader :params
|
13
|
+
|
14
|
+
def initialize(app, params = {}, &block)
|
15
|
+
super(app)
|
16
|
+
@params = params
|
17
|
+
instance_eval(&block)
|
18
|
+
end
|
19
|
+
|
20
|
+
def call(env) # rubocop: disable Metrics/AbcSize
|
21
|
+
uri = URI.parse(url_builder.call)
|
22
|
+
uri.query = query_builder&.call&.to_query
|
23
|
+
|
24
|
+
env[:url] = uri
|
25
|
+
env[:method] = http_method
|
26
|
+
env[:body] = body_builder&.call
|
27
|
+
env[:request_headers] ||= {}
|
28
|
+
|
29
|
+
if user_agent_builder
|
30
|
+
env[:request_headers]['User-Agent'] = user_agent_builder&.call
|
31
|
+
end
|
32
|
+
|
33
|
+
@app.call(env)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'faraday'
|
4
|
+
|
5
|
+
module Juknife
|
6
|
+
# :nodoc:
|
7
|
+
module Middleware
|
8
|
+
# A Faraday middleware to interpret Juknife scraping DSL
|
9
|
+
class ScrapingDSL < Faraday::Response::Middleware
|
10
|
+
include Juknife::Scraping::DSL
|
11
|
+
|
12
|
+
def initialize(app, params = {}, &block)
|
13
|
+
super(app)
|
14
|
+
@params = params
|
15
|
+
instance_eval(&block)
|
16
|
+
end
|
17
|
+
|
18
|
+
def parse(doc)
|
19
|
+
context = Juknife::Scraping::Context.new(doc)
|
20
|
+
|
21
|
+
children.each do |child|
|
22
|
+
child.visit(context)
|
23
|
+
end
|
24
|
+
|
25
|
+
context.result
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/juknife/request.rb
CHANGED
data/lib/juknife/scraping.rb
CHANGED
@@ -3,8 +3,10 @@
|
|
3
3
|
module Juknife
|
4
4
|
# :nodoc:
|
5
5
|
module Scraping
|
6
|
-
|
7
|
-
|
8
|
-
autoload :
|
6
|
+
extend ActiveSupport::Autoload
|
7
|
+
|
8
|
+
autoload :DSL
|
9
|
+
autoload :Context
|
10
|
+
autoload :Runner
|
9
11
|
end
|
10
12
|
end
|
data/lib/juknife/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: juknife
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- nyamadori
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0.1'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: faraday_middleware
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.11'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.11'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: bundler
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -66,6 +80,34 @@ dependencies:
|
|
66
80
|
- - "~>"
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '1.14'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: codeclimate-test-reporter
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pry
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
69
111
|
- !ruby/object:Gem::Dependency
|
70
112
|
name: rake
|
71
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -109,7 +151,7 @@ dependencies:
|
|
109
151
|
- !ruby/object:Gem::Version
|
110
152
|
version: '0'
|
111
153
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
154
|
+
name: simplecov
|
113
155
|
requirement: !ruby/object:Gem::Requirement
|
114
156
|
requirements:
|
115
157
|
- - ">="
|
@@ -123,7 +165,7 @@ dependencies:
|
|
123
165
|
- !ruby/object:Gem::Version
|
124
166
|
version: '0'
|
125
167
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
168
|
+
name: yard
|
127
169
|
requirement: !ruby/object:Gem::Requirement
|
128
170
|
requirements:
|
129
171
|
- - ">="
|
@@ -145,10 +187,10 @@ executables: []
|
|
145
187
|
extensions: []
|
146
188
|
extra_rdoc_files: []
|
147
189
|
files:
|
190
|
+
- ".codeclimate.yml"
|
148
191
|
- ".gitignore"
|
149
192
|
- ".rspec"
|
150
193
|
- ".rubocop.yml"
|
151
|
-
- ".travis.yml"
|
152
194
|
- ".yardopts"
|
153
195
|
- CODE_OF_CONDUCT.md
|
154
196
|
- Gemfile
|
@@ -157,12 +199,19 @@ files:
|
|
157
199
|
- Rakefile
|
158
200
|
- bin/console
|
159
201
|
- bin/setup
|
202
|
+
- circle.yml
|
160
203
|
- juknife.gemspec
|
161
204
|
- lib/juknife.rb
|
205
|
+
- lib/juknife/agent.rb
|
206
|
+
- lib/juknife/error.rb
|
207
|
+
- lib/juknife/middleware.rb
|
208
|
+
- lib/juknife/middleware/html_parser.rb
|
209
|
+
- lib/juknife/middleware/http_error_handler.rb
|
210
|
+
- lib/juknife/middleware/request_dsl.rb
|
211
|
+
- lib/juknife/middleware/scraping_dsl.rb
|
162
212
|
- lib/juknife/request.rb
|
163
213
|
- lib/juknife/request/dsl.rb
|
164
214
|
- lib/juknife/request/runner.rb
|
165
|
-
- lib/juknife/scraper.rb
|
166
215
|
- lib/juknife/scraping.rb
|
167
216
|
- lib/juknife/scraping/context.rb
|
168
217
|
- lib/juknife/scraping/dsl.rb
|
data/.travis.yml
DELETED
data/lib/juknife/scraper.rb
DELETED
@@ -1,28 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Juknife
|
4
|
-
# Juknife::Scraper is a scraper to request a web page and to extract data.
|
5
|
-
class Scraper
|
6
|
-
def initialize
|
7
|
-
@scraping = Scraping::Runner.new(&self.class.scraping_block)
|
8
|
-
@request = Request::Runner.new(&self.class.request_block)
|
9
|
-
end
|
10
|
-
|
11
|
-
def scrape(params = {})
|
12
|
-
source = @request.run(params)
|
13
|
-
@scraping.run(source, params)
|
14
|
-
end
|
15
|
-
|
16
|
-
class << self
|
17
|
-
attr_reader :scraping_block, :request_block
|
18
|
-
|
19
|
-
def scraping(&block)
|
20
|
-
@scraping_block = block
|
21
|
-
end
|
22
|
-
|
23
|
-
def request(&block)
|
24
|
-
@request_block = block
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|