flute 0.1.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -13
- data/exe/flute +51 -0
- data/flute.gemspec +9 -2
- data/lib/flute/document.rb +28 -0
- data/lib/flute/network.rb +29 -0
- data/lib/flute/request.rb +19 -0
- data/lib/flute/response.rb +22 -0
- data/lib/flute/spider_base.rb +83 -0
- data/lib/flute/version.rb +1 -1
- data/lib/flute.rb +11 -1
- data/lib/template/Gemfile +14 -0
- data/lib/template/Rakefile +21 -0
- data/lib/template/items/base_item.rb +5 -0
- data/lib/template/items/rss_item.rb +17 -0
- data/lib/template/items/web_item.rb +7 -0
- data/lib/template/main.rb +27 -0
- data/lib/template/middlewares/rss_middleware.rb +16 -0
- data/lib/template/spiders/ifanr_spider.rb +35 -0
- metadata +73 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17b89f3543d87eedc0830aa9e3673d45bb4e391f
|
4
|
+
data.tar.gz: feeaa5ae2b1b59701042f662c670d74b558a1822
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f97074135702109a1bbbc2f6a1500e1982d4e09c7ee4c1a13c06fcede28e66d7ea1bef78a935625b163a74f479f9c4c2ed2b24ce4aa2ea06b185724f17b4c5b
|
7
|
+
data.tar.gz: ccaec98172277454fd21e79d41b80b95614277f215eae5cd7594d6eb9bff1ebc9c27bb43f8eea047764f815d48b03766318831ba2f86aabb72769ced9d62f414
|
data/README.md
CHANGED
@@ -9,23 +9,13 @@
|
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
12
|
-
Add this line to your application's Gemfile:
|
13
|
-
|
14
|
-
```ruby
|
15
|
-
gem 'flute'
|
16
|
-
```
|
17
|
-
|
18
|
-
And then execute:
|
19
|
-
|
20
|
-
$ bundle
|
21
|
-
|
22
|
-
Or install it yourself as:
|
23
|
-
|
24
12
|
$ gem install flute
|
25
13
|
|
14
|
+
之后可以使用 `flute new project_name` 来创建一个项目。
|
15
|
+
|
26
16
|
## Usage
|
27
17
|
|
28
|
-
|
18
|
+
项目包含了一个爱范儿爬虫的例子，可以进行参考，之后或许会发布更多的实际例子。
|
29
19
|
|
30
20
|
|
31
21
|
## License
|
data/exe/flute
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
# Thor-based command line interface shipped as the `flute` executable.
class FluteCLI < Thor
  desc "new NAME", "create project"
  # Scaffolds a new project directory called +name+ in the current working
  # directory: creates the directory skeleton, copies the bundled template
  # files into it and prints the commands to run next.
  #
  # Fails with the error raised by Dir.mkdir when +name+ already exists.
  def new(name)
    create_dir(name, %w[tmp log config lib app app/utils])

    # Placeholder files inside the otherwise empty tmp/ and log/ directories.
    %w[tmp log].each { |dir| FileUtils.touch "#{name}/#{dir}/.keep" }

    %w[Gemfile main.rb Rakefile].each do |file|
      copy_template file, "#{name}/#{file}"
    end

    %w[items spiders middlewares].each do |dir|
      FileUtils.cp_r template_path(dir), "#{name}/app/#{dir}"
    end

    puts "cd #{name}"
    puts "bundle install"
    puts "rake start"
  end

  private

  # Creates +project_name+ and then each entry of +dirs+ beneath it.
  # +dirs+ must list parents before children because Dir.mkdir is not recursive.
  def create_dir(project_name, dirs)
    Dir.mkdir(project_name)
    dirs.each do |dir|
      Dir.mkdir "#{project_name}/#{dir}"
    end
  end

  # Copies a single template file called +name+ to +dest+.
  def copy_template(name, dest)
    FileUtils.cp template_path(name), dest
  end

  # Absolute path of a template entry, resolved relative to this executable
  # (one directory up, then lib/template).
  def template_path(name)
    executable_dir = File.dirname(__FILE__)
    File.expand_path("../lib/template/#{name}", executable_dir)
  end
end
|
50
|
+
|
51
|
+
FluteCLI.start(ARGV)
|
data/flute.gemspec
CHANGED
@@ -17,11 +17,18 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
18
18
|
f.match(%r{^(test|spec|features)/})
|
19
19
|
end
|
20
|
-
|
21
|
-
spec.
|
20
|
+
|
21
|
+
spec.bindir = "exe"
|
22
|
+
spec.executables = ["flute"]
|
22
23
|
spec.require_paths = ["lib"]
|
23
24
|
|
24
25
|
spec.add_development_dependency "bundler", "~> 1.13"
|
25
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
26
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
|
+
|
29
|
+
spec.add_dependency 'activesupport', '~> 5.0.1'
|
30
|
+
spec.add_dependency 'typhoeus', '~> 1.1.2'
|
31
|
+
spec.add_dependency 'nokogiri', '~> 1.7.0'
|
32
|
+
spec.add_dependency 'thor'
|
33
|
+
|
27
34
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Flute
  # Nokogiri HTML document with convenience readers for common <meta> tags.
  class Document < Nokogiri::HTML::Document
    # Defines og_title, og_description, og_image and og_site_name, each
    # returning the content of the matching Open Graph <meta> tag (or nil).
    %w[title description image site_name].each do |property|
      define_method("og_#{property}") { og(property) }
    end

    # Content of <meta name="description">, stripped of surrounding
    # whitespace; nil when the tag or its content attribute is missing.
    def description
      at_css('meta[name="description"]')&.attr('content')&.strip
    end

    # Content attribute of the first <meta property="og:NAME"> tag,
    # or nil when no such tag exists.
    def og(name)
      at_css("meta[property='og:#{name}']")&.attr('content')
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Flute
  # Thin wrapper around Typhoeus that applies the framework's default
  # request options.
  class Network
    # Performs a GET request immediately and wraps the result in a
    # Flute::Response.
    def get(url, params: {}, headers: {})
      Response.new(Typhoeus.get(url, make_options(params, headers)))
    end

    # Builds (but does not run) a Flute::Request for +url+.
    def request(url, params: {}, headers: {}, method: :get, body: {})
      request_options = make_options(params, headers).merge(method: method, body: body)
      Request.new(url, request_options)
    end

    # Option hash shared by every request: the given headers and params,
    # with redirect following and gzip encoding switched on.
    def make_options(params={}, headers={})
      { headers: headers, params: params, followlocation: true, accept_encoding: 'gzip' }
    end

    # Returns a new Typhoeus::Hydra limited to +max_concurrency+.
    def create_manager(max_concurrency: 5)
      Typhoeus::Hydra.new(max_concurrency: max_concurrency)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Flute
  # Typhoeus request that can carry an arbitrary payload (+meta+) and
  # parse its own response body.
  class Request < Typhoeus::Request
    # Free-form object attached to the request by the caller.
    attr_accessor :meta

    # Response body parsed as a Flute::Document.
    def response_html
      Document.parse(response.body)
    end

    # Response body parsed as JSON, with top-level keys turned into symbols.
    def response_json
      parsed = JSON.parse(response.body)
      parsed.symbolize_keys
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Flute
  # Snapshot of an HTTP response: status code and body, plus memoized parsers.
  class Response
    attr_reader :status, :body

    # +response+ must respond to #code and #body.
    def initialize(response)
      @status, @body = response.code, response.body
    end

    # True when the status code is in the 2xx range.
    def success?
      return false if 200 > @status

      @status < 300
    end

    # Body parsed as a Flute::Document; parsed once and cached.
    def parse_html
      @parse_html ||= Document.parse(@body)
    end

    # Body parsed as JSON with top-level keys symbolized; parsed once and cached.
    def parse_json
      @parse_json ||= JSON.parse(@body).symbolize_keys
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Flute
  # Base class for spiders. A spider is a pipeline of middleware methods;
  # each one receives the previous step's return value and passes its own on.
  class SpiderBase
    class << self
      # Per-class option hash, created on first access with the default
      # middleware chain and a fresh Network.
      def opts
        @opts ||= {
          middleware: %i[before_parse init_queue parse run_queue after_parse],
          network: Network.new
        }
      end

      # Stores the spider's name under opts[:name].
      def set_name(value)
        opts[:name] = value
      end

      # Stores the list of start URLs under opts[:start_urls].
      def start_urls(value)
        opts[:start_urls] = value
      end

      # Replaces the middleware chain (an array of method names).
      def middleware(value)
        opts[:middleware] = value
      end

      # Replaces the network object used by instances of this class.
      def set_network(value)
        opts[:network] = value
      end
    end

    # Request manager created by the most recent #reset_manager call.
    attr_reader :manager

    # Network used to build the request manager; same object as #network.
    def manager_network
      network
    end

    # Network configured on the spider class.
    def network
      self.class.opts[:network]
    end

    # Replaces the current manager with a new one from the network.
    def reset_manager
      @manager = manager_network.create_manager
    end

    # Initial input of the pipeline; empty by default.
    def items
      []
    end

    # Pipeline step: pass-through hook.
    def before_parse(_items)
      _items
    end

    # Pipeline step: creates a fresh manager and passes the items on.
    def init_queue(_items)
      reset_manager
      _items
    end

    # Pipeline step: pass-through hook.
    def parse(_items)
      _items
    end

    # Pipeline step: runs the manager when anything has been queued.
    def run_queue(_items)
      manager.run unless manager.queued_requests.empty?
      _items
    end

    # Pipeline step: pass-through hook.
    def after_parse(_requests)
      _requests
    end

    # Runs the configured middleware chain, starting from #items, and
    # returns the value produced by the last step.
    def pipeline
      # Resolve every step first so an unknown name fails before any step runs.
      stages = self.class.opts[:middleware].map { |name| method(name) }
      stages.reduce(items) { |current, stage| stage.call(current) }
    end
  end
end
|
data/lib/flute/version.rb
CHANGED
data/lib/flute.rb
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
require "flute/version"
|
2
2
|
|
3
|
+
require 'active_support/all'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'typhoeus'
|
6
|
+
|
7
|
+
require 'flute/document'
|
8
|
+
require 'flute/request'
|
9
|
+
require 'flute/response'
|
10
|
+
require 'flute/network'
|
11
|
+
require 'flute/spider_base'
|
12
|
+
|
3
13
|
module Flute
|
4
|
-
|
14
|
+
|
5
15
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require_relative('./main')

# Helper methods mixed into the IRB session started by the console task.
# NOTE: the module name keeps its original spelling so that any existing
# reference to it continues to resolve.
module IRBExtensition
  # Clears the constants tracked by ActiveSupport::Dependencies.
  # Prints a notice and returns nil so IRB does not echo a value.
  def reload!
    puts 'reloading...'
    ActiveSupport::Dependencies.clear
    nil
  end
end

# A description makes the task show up in `rake -T`.
desc 'Open an IRB session with main.rb loaded'
task :console do
  require 'irb'
  # Empty ARGV before IRB.start so the rake task name is not handed to IRB.
  ARGV.clear

  IRB::ExtendCommandBundle.include IRBExtensition
  IRB.start
end

desc 'Run start_ant'
task :start do
  start_ant
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Item built from a single RSS feed entry.
class RssItem < BaseItem
  attribute :title, String
  attribute :summary, String
  attribute :created, Integer
  attribute :content, String
  attribute :seed_id, Integer
  attribute :head_name, String
  attribute :head_icon, String, default: 'link'

  # Builds an RssItem from a feed entry. The entry must respond to
  # #title, #url, #summary, #content and #published. Text fields are
  # squished when present and the publication time is stored as an integer.
  def self.create(rss_item)
    squished = ->(text) { text&.squish }

    RssItem.new(
      title: squished.call(rss_item.title),
      url: rss_item.url,
      summary: squished.call(rss_item.summary),
      content: squished.call(rss_item.content),
      created: rss_item.published.to_i
    )
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Environment name; also used as the extra Bundler group loaded below.
env = ENV.fetch('CRAWLER_ENV', 'development')
# Value of AREA_ENV, or the symbol :cn when the variable is not set.
AREA = ENV.fetch('AREA_ENV', :cn)

require 'rubygems'
require 'bundler'
Bundler.require(:default, env)

# Directory containing this file.
$project_root = Pathname.new(File.dirname(__FILE__))

# Register the app/ sub-directories with ActiveSupport's autoloader.
ActiveSupport::Dependencies.autoload_paths += %w[utils items middlewares spiders].map { |dir|
  $project_root.join('app', dir)
}
|
15
|
+
|
16
|
+
# Instantiates every spider class found in app/spiders/*.rb (the class name
# is derived from the file name) and runs its pipeline. Unless CRAWLER_ENV is
# "production", the resulting items are printed as hashes.
#
# Returns the array of spider classes that were run.
def start_ant
  # Read the environment here: a local variable assigned at the top level of
  # the file is not visible inside a method body, so referring to one from
  # this method raises NameError.
  production = ENV['CRAWLER_ENV'] == 'production'

  spider_classes = Dir[$project_root.join('app', 'spiders', '*.rb')].map do |path|
    File.basename(path, '.rb').to_s.classify.constantize
  end

  spider_classes.each do |spider_class|
    results = spider_class.new.pipeline
    puts results.map(&:to_h) unless production
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Mixin that feeds a spider with items read from an RSS feed. The host must
# provide #network and a class-level opts hash containing :start_urls.
module RssMiddleware
  # Downloads the first start URL, parses it as a feed and converts every
  # entry that has a URL into an RssItem. Each stage is memoized.
  def rss_items
    @rss_body ||= network.get(self.class.opts[:start_urls].first).body
    @rss ||= Feedjira::Feed.parse(@rss_body)
    @rss_items ||= @rss.entries
                       .reject { |entry| entry.url.blank? }
                       .map { |entry| RssItem.create(entry) }
  end

  # At most the first six feed items.
  def items
    rss_items.first(6)
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Example spider: reads the ifanr RSS feed, then fetches each article page
# to pick up its Open Graph description and image.
class IfanrSpider < Flute::SpiderBase
  include RssMiddleware

  set_name 'ifanr'
  start_urls ['http://www.ifanr.com/feed']

  # Converts the incoming feed items into WebItems carrying url and title.
  def before_parse(items)
    items.map { |feed_item| WebItem.new(url: feed_item.url, title: feed_item.title) }
  end

  # Builds one request per item, attaches the item as the request's meta,
  # queues it on the manager, and returns the requests.
  def parse(items)
    items.map do |item|
      network.request(item.url).tap do |request|
        request.meta = item
        manager.queue request
      end
    end
  end

  # Copies og:description (always) and og:image (when present) from each
  # fetched page onto the item attached to its request; returns the items.
  def after_parse(requests)
    requests.map do |request|
      page = request.response_html
      request.meta.tap do |item|
        item.description = page.og_description
        item.image = page.og_image unless page.og_image.blank?
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flute
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- 'manjia
|
@@ -54,10 +54,67 @@ dependencies:
|
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
56
|
version: '3.0'
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: activesupport
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 5.0.1
|
64
|
+
type: :runtime
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - "~>"
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 5.0.1
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: typhoeus
|
73
|
+
requirement: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.1.2
|
78
|
+
type: :runtime
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 1.1.2
|
85
|
+
- !ruby/object:Gem::Dependency
|
86
|
+
name: nokogiri
|
87
|
+
requirement: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - "~>"
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: 1.7.0
|
92
|
+
type: :runtime
|
93
|
+
prerelease: false
|
94
|
+
version_requirements: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - "~>"
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.7.0
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: thor
|
101
|
+
requirement: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
type: :runtime
|
107
|
+
prerelease: false
|
108
|
+
version_requirements: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
57
113
|
description: 一个简单的网络爬虫框架.
|
58
114
|
email:
|
59
115
|
- tywf91@gmail.com
|
60
|
-
executables:
|
116
|
+
executables:
|
117
|
+
- flute
|
61
118
|
extensions: []
|
62
119
|
extra_rdoc_files: []
|
63
120
|
files:
|
@@ -71,9 +128,23 @@ files:
|
|
71
128
|
- Rakefile
|
72
129
|
- bin/console
|
73
130
|
- bin/setup
|
131
|
+
- exe/flute
|
74
132
|
- flute.gemspec
|
75
133
|
- lib/flute.rb
|
134
|
+
- lib/flute/document.rb
|
135
|
+
- lib/flute/network.rb
|
136
|
+
- lib/flute/request.rb
|
137
|
+
- lib/flute/response.rb
|
138
|
+
- lib/flute/spider_base.rb
|
76
139
|
- lib/flute/version.rb
|
140
|
+
- lib/template/Gemfile
|
141
|
+
- lib/template/Rakefile
|
142
|
+
- lib/template/items/base_item.rb
|
143
|
+
- lib/template/items/rss_item.rb
|
144
|
+
- lib/template/items/web_item.rb
|
145
|
+
- lib/template/main.rb
|
146
|
+
- lib/template/middlewares/rss_middleware.rb
|
147
|
+
- lib/template/spiders/ifanr_spider.rb
|
77
148
|
homepage: https://github.com/oxoooo/flute
|
78
149
|
licenses:
|
79
150
|
- MIT
|