flute 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -13
- data/exe/flute +51 -0
- data/flute.gemspec +9 -2
- data/lib/flute/document.rb +28 -0
- data/lib/flute/network.rb +29 -0
- data/lib/flute/request.rb +19 -0
- data/lib/flute/response.rb +22 -0
- data/lib/flute/spider_base.rb +83 -0
- data/lib/flute/version.rb +1 -1
- data/lib/flute.rb +11 -1
- data/lib/template/Gemfile +14 -0
- data/lib/template/Rakefile +21 -0
- data/lib/template/items/base_item.rb +5 -0
- data/lib/template/items/rss_item.rb +17 -0
- data/lib/template/items/web_item.rb +7 -0
- data/lib/template/main.rb +27 -0
- data/lib/template/middlewares/rss_middleware.rb +16 -0
- data/lib/template/spiders/ifanr_spider.rb +35 -0
- metadata +73 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 17b89f3543d87eedc0830aa9e3673d45bb4e391f
|
4
|
+
data.tar.gz: feeaa5ae2b1b59701042f662c670d74b558a1822
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f97074135702109a1bbbc2f6a1500e1982d4e09c7ee4c1a13c06fcede28e66d7ea1bef78a935625b163a74f479f9c4c2ed2b24ce4aa2ea06b185724f17b4c5b
|
7
|
+
data.tar.gz: ccaec98172277454fd21e79d41b80b95614277f215eae5cd7594d6eb9bff1ebc9c27bb43f8eea047764f815d48b03766318831ba2f86aabb72769ced9d62f414
|
data/README.md
CHANGED
@@ -9,23 +9,13 @@
|
|
9
9
|
|
10
10
|
## Installation
|
11
11
|
|
12
|
-
Add this line to your application's Gemfile:
|
13
|
-
|
14
|
-
```ruby
|
15
|
-
gem 'flute'
|
16
|
-
```
|
17
|
-
|
18
|
-
And then execute:
|
19
|
-
|
20
|
-
$ bundle
|
21
|
-
|
22
|
-
Or install it yourself as:
|
23
|
-
|
24
12
|
$ gem install flute
|
25
13
|
|
14
|
+
之后可以使用 `flute new project_name` 来创建一个项目。
|
15
|
+
|
26
16
|
## Usage
|
27
17
|
|
28
|
-
|
18
|
+
项目包含了一个爱范儿爬虫的例子，可以作为参考；之后或许会发布更多的实际例子。
|
29
19
|
|
30
20
|
|
31
21
|
## License
|
data/exe/flute
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
# Thor-based command line interface for flute.
class FluteCLI < Thor
  desc "new NAME", "create project"
  # Scaffolds a new project directory called +name+ in the current
  # working directory: creates the directory skeleton, drops .keep files
  # into tmp/ and log/, copies the bundled template files, and prints the
  # commands to run next.
  #
  # Raises whatever Dir.mkdir raises when +name+ already exists.
  def new(name)
    create_dir(name, %w[tmp log config lib app app/utils])

    # Placeholder files in the otherwise empty tmp/ and log/ directories.
    %w[tmp log].each do |dir|
      FileUtils.touch "#{name}/#{dir}/.keep"
    end

    # Single files copied to the project root.
    %w[Gemfile main.rb Rakefile].each do |file|
      copy_template file, "#{name}/#{file}"
    end

    # Whole template directories copied under app/.
    %w[items spiders middlewares].each do |folder|
      FileUtils.cp_r template_path(folder), "#{name}/app/#{folder}"
    end

    puts "cd #{name}", "bundle install", "rake start"
  end

  private

  # Creates the project root, then each entry of +dirs+ beneath it.
  # Parents must be listed before their children ('app' before 'app/utils').
  def create_dir(project_name, dirs)
    Dir.mkdir(project_name)
    dirs.each do |dir|
      Dir.mkdir "#{project_name}/#{dir}"
    end
  end

  # Copies one file from the bundled template directory to +dest+.
  def copy_template(name, dest)
    source = template_path(name)
    FileUtils.cp source, dest
  end

  # Absolute path of +name+ inside lib/template, resolved relative to the
  # directory containing this executable.
  def template_path(name)
    here = File.dirname(__FILE__)
    File.expand_path("../lib/template/#{name}", here)
  end
end
|
50
|
+
|
51
|
+
FluteCLI.start(ARGV)
|
data/flute.gemspec
CHANGED
@@ -17,11 +17,18 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
18
18
|
f.match(%r{^(test|spec|features)/})
|
19
19
|
end
|
20
|
-
|
21
|
-
spec.
|
20
|
+
|
21
|
+
spec.bindir = "exe"
|
22
|
+
spec.executables = ["flute"]
|
22
23
|
spec.require_paths = ["lib"]
|
23
24
|
|
24
25
|
spec.add_development_dependency "bundler", "~> 1.13"
|
25
26
|
spec.add_development_dependency "rake", "~> 10.0"
|
26
27
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
|
+
|
29
|
+
spec.add_dependency 'activesupport', '~> 5.0.1'
|
30
|
+
spec.add_dependency 'typhoeus', '~> 1.1.2'
|
31
|
+
spec.add_dependency 'nokogiri', '~> 1.7.0'
|
32
|
+
spec.add_dependency 'thor'
|
33
|
+
|
27
34
|
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Flute
  # Nokogiri HTML document with shortcuts for reading page metadata
  # (<meta> description and Open Graph tags).
  class Document < Nokogiri::HTML::Document
    # Content of <meta property="og:title">, or nil when absent.
    def og_title
      og('title')
    end

    # Content of <meta property="og:description">, or nil when absent.
    def og_description
      og('description')
    end

    # Content of <meta property="og:image">, or nil when absent.
    def og_image
      og('image')
    end

    # Content of <meta property="og:site_name">, or nil when absent.
    def og_site_name
      og('site_name')
    end

    # Whitespace-stripped content of <meta name="description">, or nil
    # when the tag or its content attribute is missing.
    def description
      tag = at_css('meta[name="description"]')
      tag&.attr('content')&.strip
    end

    # Content attribute of the first <meta property="og:NAME"> tag, or nil
    # when no such tag exists.
    def og(name)
      tag = at_css("meta[property='og:#{name}']")
      tag&.attr('content')
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Flute
  # Thin wrapper around Typhoeus that builds request options in one place.
  class Network
    # Performs a GET through Typhoeus.get and wraps the result in a
    # Flute::Response.
    def get(url, params: {}, headers: {})
      Response.new(Typhoeus.get(url, make_options(params, headers)))
    end

    # Builds (but does not run) a Flute::Request for +url+. The shared
    # options from make_options are extended with +method+ and +body+.
    def request(url, params: {}, headers: {}, method: :get, body: {})
      request_options = make_options(params, headers).merge(method: method, body: body)
      Request.new(url, request_options)
    end

    # Option hash shared by every request: the given headers and params,
    # plus followlocation and gzip accept_encoding.
    def make_options(params={}, headers={})
      {
        headers: headers,
        params: params,
        followlocation: true,
        accept_encoding: 'gzip'
      }
    end

    # Returns a new Typhoeus::Hydra created with the given max_concurrency.
    def create_manager(max_concurrency: 5)
      Typhoeus::Hydra.new(max_concurrency: max_concurrency)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Flute
  # Typhoeus request that can carry an arbitrary payload (+meta+) and
  # parse its own response body.
  class Request < Typhoeus::Request
    # Free-form object attached to the request by the caller; nil until set.
    attr_accessor :meta

    # Response body parsed into a Flute::Document.
    def response_html
      Document.parse(response.body)
    end

    # Response body parsed as JSON, with the top-level keys symbolized.
    def response_json
      parsed = JSON.parse(response.body)
      parsed.symbolize_keys
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Flute
  # Snapshot of an HTTP response: the status code and the body, with
  # memoized HTML/JSON parsing helpers.
  class Response
    attr_reader :status, :body

    # +response+ must respond to #code and #body.
    def initialize(response)
      @status, @body = response.code, response.body
    end

    # True when the status code is in the 2xx range.
    def success?
      200 <= @status && 300 > @status
    end

    # Body parsed into a Flute::Document; parsed once and cached.
    def parse_html
      return @parse_html if @parse_html

      @parse_html = Document.parse(@body)
    end

    # Body parsed as JSON with top-level keys symbolized; parsed once and
    # cached.
    def parse_json
      return @parse_json if @parse_json

      @parse_json = JSON.parse(@body).symbolize_keys
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Flute
  # Base class for spiders. A spider is a pipeline of "middleware" methods;
  # each one receives the previous step's return value and returns the
  # input for the next. Per-class settings live in the class-level +opts+
  # hash and are set through the small DSL at the bottom of the class.
  class SpiderBase
    # Request queue created by #reset_manager; nil before that.
    attr_reader :manager

    # Network object used to build the request queue. Delegates to
    # #network.
    def manager_network
      network
    end

    # Network object configured for this spider class.
    def network
      self.class.opts[:network]
    end

    # Replaces the current queue with a fresh one from the network object.
    def reset_manager
      @manager = manager_network.create_manager
    end

    # Seed items for the pipeline; empty here.
    def items
      []
    end

    # Pipeline step: returns the items untouched.
    def before_parse(_items)
      _items
    end

    # Pipeline step: starts a fresh request queue and passes the items on.
    def init_queue(_items)
      reset_manager
      _items
    end

    # Pipeline step: returns the items untouched.
    def parse(_items)
      _items
    end

    # Pipeline step: runs the queue when it holds at least one request,
    # then passes the items on.
    def run_queue(_items)
      manager.run unless manager.queued_requests.size.zero?
      _items
    end

    # Pipeline step: returns its input untouched.
    def after_parse(_requests)
      _requests
    end

    # Runs the configured middleware in order, threading the value
    # returned by each step into the next, starting from #items.
    # All middleware names are resolved to methods before any step runs.
    def pipeline
      stages = self.class.opts[:middleware].map { |step| method(step) }
      stages.inject(items) { |current, stage| stage.call(current) }
    end

    # Settings hash of this class, lazily created with the default
    # middleware chain and a new Network. Stored in a class-level instance
    # variable, so each class builds its own hash.
    def self.opts
      @opts ||= {
        middleware: %i[before_parse init_queue parse run_queue after_parse],
        network: Network.new
      }
    end

    # Sets the spider's name.
    def self.set_name(value)
      opts[:name] = value
    end

    # Sets the list of start URLs.
    def self.start_urls(value)
      opts[:start_urls] = value
    end

    # Replaces the middleware chain (a list of method names).
    def self.middleware(value)
      opts[:middleware] = value
    end

    # Replaces the network object.
    def self.set_network(value)
      opts[:network] = value
    end
  end
end
|
data/lib/flute/version.rb
CHANGED
data/lib/flute.rb
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
require "flute/version"
|
2
2
|
|
3
|
+
require 'active_support/all'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'typhoeus'
|
6
|
+
|
7
|
+
require 'flute/document'
|
8
|
+
require 'flute/request'
|
9
|
+
require 'flute/response'
|
10
|
+
require 'flute/network'
|
11
|
+
require 'flute/spider_base'
|
12
|
+
|
3
13
|
module Flute
|
4
|
-
|
14
|
+
|
5
15
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Load the project bootstrap (defines start_ant and the autoload paths).
require_relative 'main'

# Extra commands mixed into IRB sessions started by the console task.
module IRBExtensition
  # Prints a notice and calls ActiveSupport::Dependencies.clear, so that
  # autoloaded constants are presumably reloaded on next use. Returns nil
  # so IRB echoes nothing meaningful.
  def reload!
    puts 'reloading...'
    ActiveSupport::Dependencies.clear
    nil
  end
end

# rake console: open an IRB session with the project loaded.
task :console do
  require 'irb'
  # Empty ARGV before starting IRB, presumably so rake's own arguments
  # are not handed to IRB.
  ARGV.clear

  IRB::ExtendCommandBundle.send :include, IRBExtensition
  IRB.start
end

# rake start: run every spider once.
task(:start) { start_ant }
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Item built from one entry of a parsed RSS feed.
class RssItem < BaseItem
  attribute :title, String
  attribute :summary, String
  attribute :created, Integer
  attribute :content, String
  attribute :seed_id, Integer
  attribute :head_name, String
  attribute :head_icon, String, default: 'link'

  # Builds an RssItem from a feed entry. Text fields are whitespace-squished
  # when present; +created+ is the entry's published value converted with
  # to_i.
  # NOTE(review): +url+ is not declared in this class; presumably BaseItem
  # provides it. Confirm.
  def self.create(rss_item)
    RssItem.new(
      title: rss_item.title&.squish,
      url: rss_item.url,
      summary: rss_item.summary&.squish,
      content: rss_item.content&.squish,
      created: rss_item.published.to_i
    )
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# Runtime environment name; selects the extra Bundler group loaded below.
env = ENV.fetch('CRAWLER_ENV', 'development')
# Area setting: the AREA_ENV string when set, otherwise the symbol :cn.
AREA = ENV.fetch('AREA_ENV', :cn)

require 'rubygems'
require 'bundler'
# Require every gem in the Gemfile's default group and the env group.
Bundler.require(:default, env)

# Directory containing this file, used as the base for project paths.
$project_root = Pathname.new(File.dirname(__FILE__))

# Register the app/ subdirectories with ActiveSupport's autoloader.
ActiveSupport::Dependencies.autoload_paths += %w[utils items middlewares spiders].map do |dir|
  $project_root.join('app', dir)
end
|
15
|
+
|
16
|
+
# Instantiates every spider found in app/spiders and runs its pipeline.
#
# Spider classes are derived from the file names (ifanr_spider.rb ->
# IfanrSpider) via String#classify and #constantize. Unless CRAWLER_ENV is
# "production", the items returned by each pipeline are printed as hashes.
#
# The environment is read from ENV inside the method: a top-level local
# variable such as `env` is not visible in a method body, so referring to
# it here would raise NameError.
#
# Returns the array of spider classes that were run.
def start_ant
  production = (ENV['CRAWLER_ENV'] || 'development') == 'production'

  spider_files = Dir[$project_root.join('app', 'spiders', '*.rb').to_s]
  spider_classes = spider_files.map do |path|
    File.basename(path, '.rb').classify.constantize
  end

  spider_classes.each do |spider_class|
    results = spider_class.new.pipeline
    puts(results.map(&:to_h)) unless production
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Mixin that seeds a spider's pipeline from an RSS feed. The including
# class must provide #network and a class-level opts[:start_urls].
module RssMiddleware
  # Downloads the first start URL, parses it with Feedjira, and converts
  # every entry that has a non-blank url into an RssItem. Each stage is
  # memoized, so the feed is fetched and parsed once per instance.
  def rss_items
    @rss_body ||= network.get(self.class.opts[:start_urls].first).body
    @rss ||= Feedjira::Feed.parse(@rss_body)
    @rss_items ||= begin
      usable = @rss.entries.reject { |entry| entry.url.blank? }
      usable.map { |entry| RssItem.create(entry) }
    end
  end

  # Seed items for the pipeline: at most the first six feed items.
  def items
    rss_items.first(6)
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Example spider: reads the ifanr RSS feed, fetches each article page and
# enriches the item with Open Graph metadata.
class IfanrSpider < Flute::SpiderBase
  include RssMiddleware

  set_name 'ifanr'
  start_urls ['http://www.ifanr.com/feed']

  # Converts the feed items into WebItems carrying only url and title.
  def before_parse(items)
    items.map do |item|
      WebItem.new(url: item.url, title: item.title)
    end
  end

  # Queues one request per item, attaching the item to the request as
  # meta. Returns the requests so later steps can read their responses.
  def parse(items)
    items.map do |item|
      network.request(item.url).tap do |request|
        request.meta = item
        manager.queue request
      end
    end
  end

  # Copies og:description (always) and og:image (only when non-blank) from
  # each response page onto the request's item. Returns the items.
  def after_parse(requests)
    requests.map do |request|
      page = request.response_html
      request.meta.tap do |item|
        item.description = page.og_description
        item.image = page.og_image unless page.og_image.blank?
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: flute
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- 'manjia
|
@@ -54,10 +54,67 @@ dependencies:
|
|
54
54
|
- - "~>"
|
55
55
|
- !ruby/object:Gem::Version
|
56
56
|
version: '3.0'
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: activesupport
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: 5.0.1
|
64
|
+
type: :runtime
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - "~>"
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 5.0.1
|
71
|
+
- !ruby/object:Gem::Dependency
|
72
|
+
name: typhoeus
|
73
|
+
requirement: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 1.1.2
|
78
|
+
type: :runtime
|
79
|
+
prerelease: false
|
80
|
+
version_requirements: !ruby/object:Gem::Requirement
|
81
|
+
requirements:
|
82
|
+
- - "~>"
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
version: 1.1.2
|
85
|
+
- !ruby/object:Gem::Dependency
|
86
|
+
name: nokogiri
|
87
|
+
requirement: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - "~>"
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: 1.7.0
|
92
|
+
type: :runtime
|
93
|
+
prerelease: false
|
94
|
+
version_requirements: !ruby/object:Gem::Requirement
|
95
|
+
requirements:
|
96
|
+
- - "~>"
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.7.0
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: thor
|
101
|
+
requirement: !ruby/object:Gem::Requirement
|
102
|
+
requirements:
|
103
|
+
- - ">="
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
106
|
+
type: :runtime
|
107
|
+
prerelease: false
|
108
|
+
version_requirements: !ruby/object:Gem::Requirement
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: '0'
|
57
113
|
description: 一个简单的网络爬虫框架.
|
58
114
|
email:
|
59
115
|
- tywf91@gmail.com
|
60
|
-
executables:
|
116
|
+
executables:
|
117
|
+
- flute
|
61
118
|
extensions: []
|
62
119
|
extra_rdoc_files: []
|
63
120
|
files:
|
@@ -71,9 +128,23 @@ files:
|
|
71
128
|
- Rakefile
|
72
129
|
- bin/console
|
73
130
|
- bin/setup
|
131
|
+
- exe/flute
|
74
132
|
- flute.gemspec
|
75
133
|
- lib/flute.rb
|
134
|
+
- lib/flute/document.rb
|
135
|
+
- lib/flute/network.rb
|
136
|
+
- lib/flute/request.rb
|
137
|
+
- lib/flute/response.rb
|
138
|
+
- lib/flute/spider_base.rb
|
76
139
|
- lib/flute/version.rb
|
140
|
+
- lib/template/Gemfile
|
141
|
+
- lib/template/Rakefile
|
142
|
+
- lib/template/items/base_item.rb
|
143
|
+
- lib/template/items/rss_item.rb
|
144
|
+
- lib/template/items/web_item.rb
|
145
|
+
- lib/template/main.rb
|
146
|
+
- lib/template/middlewares/rss_middleware.rb
|
147
|
+
- lib/template/spiders/ifanr_spider.rb
|
77
148
|
homepage: https://github.com/oxoooo/flute
|
78
149
|
licenses:
|
79
150
|
- MIT
|