flute 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 96163e0495e9bc2f65b308943c132fbb622ef4a1
4
- data.tar.gz: 23092154b381dd0a02fe18d5e81aa0ab4a691d4c
3
+ metadata.gz: 17b89f3543d87eedc0830aa9e3673d45bb4e391f
4
+ data.tar.gz: feeaa5ae2b1b59701042f662c670d74b558a1822
5
5
  SHA512:
6
- metadata.gz: 1ce87363fc3566fed6590d9fcebd547bfd0fa1574d22d300bb6f32430a9dd4a20c8cbf761b7b3292b5fbbffad1c5d0a67976e9c315e0ca6785cd7accdc9c73b7
7
- data.tar.gz: 570a96fce99075b8f68aa9ab8fd968b8c997caeab78a396f6192f5de4add5d37e26321279d4f7ccf18bbef40e4255ef6126fdf562a5d59e144ec9e9c6c33e45c
6
+ metadata.gz: 1f97074135702109a1bbbc2f6a1500e1982d4e09c7ee4c1a13c06fcede28e66d7ea1bef78a935625b163a74f479f9c4c2ed2b24ce4aa2ea06b185724f17b4c5b
7
+ data.tar.gz: ccaec98172277454fd21e79d41b80b95614277f215eae5cd7594d6eb9bff1ebc9c27bb43f8eea047764f815d48b03766318831ba2f86aabb72769ced9d62f414
data/README.md CHANGED
@@ -9,23 +9,13 @@
9
9
 
10
10
  ## Installation
11
11
 
12
- Add this line to your application's Gemfile:
13
-
14
- ```ruby
15
- gem 'flute'
16
- ```
17
-
18
- And then execute:
19
-
20
- $ bundle
21
-
22
- Or install it yourself as:
23
-
24
12
  $ gem install flute
25
13
 
14
+ 之后可以使用 `flute new project_name` 来创建一个项目。
15
+
26
16
  ## Usage
27
17
 
28
- TODO
18
+ 项目包含了一个爬取爱范儿的爬虫例子,可供参考;之后或许会发布更多的实际例子。
29
19
 
30
20
 
31
21
  ## License
data/exe/flute ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'thor'
4
+ require 'fileutils'
5
+
6
# Thor-based command line interface for the flute gem.
class FluteCLI < Thor
  # `flute new NAME`: creates the project skeleton under NAME, copies the
  # bundled template files into it and prints the follow-up commands.
  desc "new NAME", "create project"
  def new(name)
    create_dir(name, %w[tmp log config lib app app/utils])

    # Placeholder files in the otherwise empty tmp/ and log/ directories.
    %w[tmp log].each { |dir| FileUtils.touch "#{name}/#{dir}/.keep" }

    %w[Gemfile main.rb Rakefile].each do |file|
      copy_template file, "#{name}/#{file}"
    end

    # Template directories are copied recursively into app/.
    %w[items spiders middlewares].each do |dir|
      FileUtils.cp_r template_path(dir), "#{name}/app/#{dir}"
    end

    puts "cd #{name}", "bundle install", "rake start"
  end

  private

  # Creates the project root, then each entry of +dirs+ beneath it.
  # Dir.mkdir raises if a directory already exists.
  def create_dir(project_name, dirs)
    Dir.mkdir(project_name)
    dirs.each do |dir|
      Dir.mkdir "#{project_name}/#{dir}"
    end
  end

  # Copies a single template file to +dest+.
  def copy_template(name, dest)
    FileUtils.cp(template_path(name), dest)
  end

  # Absolute path of a template entry, resolved relative to this script's
  # directory (../lib/template/<name>).
  def template_path(name)
    script_dir = File.dirname(__FILE__)
    File.expand_path("../lib/template/#{name}", script_dir)
  end
end
50
+
51
+ FluteCLI.start(ARGV)
data/flute.gemspec CHANGED
@@ -17,11 +17,18 @@ Gem::Specification.new do |spec|
17
17
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
18
  f.match(%r{^(test|spec|features)/})
19
19
  end
20
- spec.bindir = "exe"
21
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+
21
+ spec.bindir = "exe"
22
+ spec.executables = ["flute"]
22
23
  spec.require_paths = ["lib"]
23
24
 
24
25
  spec.add_development_dependency "bundler", "~> 1.13"
25
26
  spec.add_development_dependency "rake", "~> 10.0"
26
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_dependency 'activesupport', '~> 5.0.1'
30
+ spec.add_dependency 'typhoeus', '~> 1.1.2'
31
+ spec.add_dependency 'nokogiri', '~> 1.7.0'
32
+ spec.add_dependency 'thor'
33
+
27
34
  end
@@ -0,0 +1,28 @@
1
module Flute
  # Nokogiri HTML document extended with readers for common <meta> tags.
  class Document < Nokogiri::HTML::Document
    # Defines og_title, og_description, og_image and og_site_name; each one
    # delegates to #og with the matching property name.
    %w[title description image site_name].each do |property|
      define_method("og_#{property}") { og(property) }
    end

    # Stripped content of the first <meta name="description"> tag, or nil
    # when the tag or its content attribute is missing.
    def description
      at_css('meta[name="description"]')&.attr('content')&.strip
    end

    # Content of the first <meta property="og:NAME"> tag, or nil when no
    # such tag exists.
    def og(name)
      at_css("meta[property='og:#{name}']")&.attr('content')
    end
  end
end
@@ -0,0 +1,29 @@
1
module Flute
  # Thin wrapper around Typhoeus for one-off requests, queued requests and
  # Hydra managers.
  class Network
    # Performs a GET immediately and wraps the Typhoeus response in a
    # Flute::Response.
    def get(url, params: {}, headers: {})
      Response.new(Typhoeus.get(url, make_options(params, headers)))
    end

    # Builds (but does not run) a Flute::Request with the shared options plus
    # the given HTTP method and body.
    def request(url, params: {}, headers: {}, method: :get, body: {})
      Request.new(url, make_options(params, headers).merge(method: method, body: body))
    end

    # Option hash shared by every request: caller-supplied headers and params
    # plus the fixed followlocation / accept_encoding settings.
    def make_options(params={}, headers={})
      {
        headers: headers,
        params: params,
        followlocation: true,
        accept_encoding: 'gzip'
      }
    end

    # New Typhoeus::Hydra limited to +max_concurrency+ parallel requests.
    def create_manager(max_concurrency: 5)
      Typhoeus::Hydra.new(max_concurrency: max_concurrency)
    end
  end
end
@@ -0,0 +1,19 @@
1
module Flute
  # Typhoeus request that can carry an arbitrary payload (+meta+) and parse
  # its own response body.
  class Request < Typhoeus::Request
    # Free-form object attached by the caller (the bundled spider stores the
    # item being enriched here).
    attr_accessor :meta

    # Parses the response body as HTML; a new Document is built on each call.
    def response_html
      Document.parse(response.body)
    end

    # Parses the response body as JSON and symbolizes the top-level keys.
    def response_json
      parsed = JSON.parse(response.body)
      parsed.symbolize_keys
    end
  end
end
@@ -0,0 +1,22 @@
1
module Flute
  # Snapshot of the status code and body of a finished HTTP response, with
  # memoized HTML/JSON parsing helpers.
  class Response
    attr_reader :status, :body

    # +response+ must respond to #code and #body.
    def initialize(response)
      @status = response.code
      @body = response.body
    end

    # True when the status code lies in the 2xx range.
    def success?
      (200...300).cover?(@status)
    end

    # Body parsed as an HTML Document; parsed once and cached.
    def parse_html
      @parse_html ||= Document.parse(@body)
    end

    # Body parsed as JSON with top-level keys symbolized; parsed once and cached.
    def parse_json
      @parse_json ||= JSON.parse(@body).symbolize_keys
    end
  end
end
@@ -0,0 +1,83 @@
1
module Flute
  # Base class for spiders.
  #
  # A spider produces a list of items (#items) and threads it through the
  # middleware methods named in the class-level options, in order. Each
  # middleware receives the previous step's return value and returns the
  # value for the next step. Subclasses override the individual steps.
  class SpiderBase
    # Request manager created by #reset_manager; nil until #init_queue (or
    # #reset_manager) has run.
    attr_reader :manager

    # Network object used to create the request manager.
    def manager_network
      network
    end

    # Network object configured for this spider class.
    def network
      self.class.opts[:network]
    end

    # Replaces the current manager with a fresh one from the network.
    def reset_manager
      @manager = manager_network.create_manager
    end

    # Initial item list fed into the pipeline; empty by default.
    def items
      []
    end

    # Middleware step: hook run before the queue is initialised. Pass-through.
    def before_parse(_items)
      _items
    end

    # Middleware step: creates a fresh manager and passes the items through.
    def init_queue(_items)
      reset_manager
      _items
    end

    # Middleware step: hook where subclasses queue requests. Pass-through.
    def parse(_items)
      _items
    end

    # Middleware step: runs the manager when it has queued requests.
    # Does nothing when no manager exists yet (e.g. a custom middleware list
    # without :init_queue), because nothing can have been queued then.
    def run_queue(_items)
      pending = manager ? manager.queued_requests : []
      manager.run unless pending.empty?
      _items
    end

    # Middleware step: hook run after the queue has been processed. Pass-through.
    def after_parse(_requests)
      _requests
    end

    # Runs the configured middleware chain over #items and returns the value
    # produced by the last step. All middleware names are resolved to methods
    # before the first step runs, so an unknown name fails up front.
    def pipeline
      steps = self.class.opts[:middleware].map { |name| method(name) }
      steps.reduce(items) { |current, step| step.call(current) }
    end

    class << self
      # Per-class option hash, created on first access with the default
      # middleware order and a new Network instance.
      def opts
        @opts ||= {
          middleware: [:before_parse, :init_queue, :parse, :run_queue, :after_parse],
          network: Network.new
        }
      end

      # Stores the spider's name.
      def set_name(value)
        opts[:name] = value
      end

      # Stores the list of start URLs.
      def start_urls(value)
        opts[:start_urls] = value
      end

      # Replaces the ordered list of middleware method names.
      def middleware(value)
        opts[:middleware] = value
      end

      # Replaces the network object used by instances.
      def set_network(value)
        opts[:network] = value
      end
    end
  end
end
data/lib/flute/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Flute
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/flute.rb CHANGED
@@ -1,5 +1,15 @@
1
1
  require "flute/version"
2
2
 
3
+ require 'active_support/all'
4
+ require 'nokogiri'
5
+ require 'typhoeus'
6
+
7
+ require 'flute/document'
8
+ require 'flute/request'
9
+ require 'flute/response'
10
+ require 'flute/network'
11
+ require 'flute/spider_base'
12
+
3
13
  module Flute
4
- # Your code goes here...
14
+
5
15
  end
@@ -0,0 +1,14 @@
1
# Gemfile for a generated flute project.
source 'https://rubygems.org'

gem 'flute'

# Libraries the framework builds on; active_support/all is required so
# Bundler.require loads the full ActiveSupport extension set.
gem 'nokogiri'
gem 'activesupport', require: 'active_support/all'
gem 'typhoeus'

# Used by the bundled example items and RSS middleware.
gem 'virtus'
gem 'feedjira'

gem 'pry', group: :development
@@ -0,0 +1,21 @@
1
require_relative './main'

# Extra commands made available inside the IRB session of `rake console`.
module IRBExtensition
  # Prints a notice and calls ActiveSupport::Dependencies.clear; returns nil.
  def reload!
    puts 'reloading...'
    ActiveSupport::Dependencies.clear
    nil
  end
end

# Starts an IRB session with the project loaded.
task :console do
  require 'irb'
  # Empty ARGV before IRB.start so the rake task name is not left in it.
  ARGV.clear

  IRB::ExtendCommandBundle.include IRBExtensition
  IRB.start
end

# Runs every spider through start_ant (defined in main.rb).
task(:start) { start_ant }
@@ -0,0 +1,5 @@
1
# Common base for scraped items: a Virtus model with a single url attribute.
class BaseItem
  include Virtus.model

  attribute(:url, String)
end
@@ -0,0 +1,17 @@
1
# Item built from a single RSS feed entry.
class RssItem < BaseItem
  attribute :title, String
  attribute :summary, String
  attribute :created, Integer
  attribute :content, String
  attribute :seed_id, Integer
  attribute :head_name, String
  attribute :head_icon, String, default: 'link'

  # Builds an RssItem from a feed entry responding to title, url, summary,
  # content and published. Text fields are squished when present; the
  # published value is stored as an integer via #to_i.
  def self.create(rss_item)
    fields = {
      title: rss_item.title&.squish,
      url: rss_item.url,
      summary: rss_item.summary&.squish,
      content: rss_item.content&.squish,
      created: rss_item.published.to_i
    }
    RssItem.new(fields)
  end
end
@@ -0,0 +1,7 @@
1
# Item describing a web page: its title, description and image,
# in addition to the url inherited from BaseItem.
class WebItem < BaseItem
  attribute(:title, String)
  attribute(:description, String)
  attribute(:image, String)
end
@@ -0,0 +1,27 @@
1
# Project bootstrap: loads the bundle for the current environment, registers
# the app/ sub-directories for autoloading and defines start_ant.

# Environment name. Kept in a constant because a top-level local variable is
# not visible inside method bodies such as start_ant.
CRAWLER_ENV = ENV['CRAWLER_ENV'] || 'development'
# NOTE(review): AREA is a String when AREA_ENV is set but the Symbol :cn
# otherwise — confirm which type callers expect.
AREA = ENV['AREA_ENV'] || :cn

require 'rubygems'
require 'bundler'
Bundler.require(:default, CRAWLER_ENV)

# Directory containing this file; used as the root for all project paths.
$project_root = Pathname.new File.dirname(__FILE__)

load_paths = %w[utils items middlewares spiders].map do |dir|
  $project_root.join('app', dir)
end

ActiveSupport::Dependencies.autoload_paths += load_paths

# Instantiates every spider class found in app/spiders/*.rb (class name
# derived from the file name) and runs its pipeline. Outside production the
# resulting items are printed as hashes.
def start_ant
  spider_classes = Dir[$project_root.join('app', 'spiders', '*.rb')].map do |path|
    File.basename(path, '.rb').classify.constantize
  end

  spider_classes.each do |spider_class|
    results = spider_class.new.pipeline
    puts results.map(&:to_h) unless CRAWLER_ENV == 'production'
  end
end
@@ -0,0 +1,16 @@
1
# Mixin that supplies a spider's items from the RSS feed at its first start URL.
module RssMiddleware
  # Downloads the feed, parses it and converts every entry with a non-blank
  # url into an RssItem. The body, the parsed feed and the item list are
  # each memoized.
  def rss_items
    @rss_body ||= network.get(self.class.opts[:start_urls].first).body
    @rss ||= Feedjira::Feed.parse(@rss_body)
    @rss_items ||= @rss.entries
                       .reject { |entry| entry.url.blank? }
                       .map { |entry| RssItem.create(entry) }
  end

  # At most the first six feed items.
  def items
    rss_items.first(6)
  end
end
@@ -0,0 +1,35 @@
1
# Example spider: reads the ifanr RSS feed, then fetches each linked page to
# fill in its Open Graph description and image.
class IfanrSpider < Flute::SpiderBase
  include RssMiddleware

  set_name 'ifanr'
  start_urls ['http://www.ifanr.com/feed']

  # Converts each incoming item into a WebItem carrying its url and title.
  def before_parse(items)
    items.map do |feed_item|
      WebItem.new(url: feed_item.url, title: feed_item.title)
    end
  end

  # Queues one request per item on the manager, attaching the item as the
  # request's meta. Returns the requests.
  def parse(items)
    items.map do |web_item|
      network.request(web_item.url).tap do |page_request|
        page_request.meta = web_item
        manager.queue page_request
      end
    end
  end

  # Copies og:description (always) and og:image (only when not blank) from
  # each response onto the item stored in the request's meta. Returns the items.
  def after_parse(requests)
    requests.map do |page_request|
      page = page_request.response_html
      web_item = page_request.meta
      web_item.description = page.og_description

      image = page.og_image
      web_item.image = image unless image.blank?

      web_item
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flute
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - 'manjia
@@ -54,10 +54,67 @@ dependencies:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
56
  version: '3.0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: activesupport
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: 5.0.1
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - "~>"
69
+ - !ruby/object:Gem::Version
70
+ version: 5.0.1
71
+ - !ruby/object:Gem::Dependency
72
+ name: typhoeus
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: 1.1.2
78
+ type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: 1.1.2
85
+ - !ruby/object:Gem::Dependency
86
+ name: nokogiri
87
+ requirement: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - "~>"
90
+ - !ruby/object:Gem::Version
91
+ version: 1.7.0
92
+ type: :runtime
93
+ prerelease: false
94
+ version_requirements: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - "~>"
97
+ - !ruby/object:Gem::Version
98
+ version: 1.7.0
99
+ - !ruby/object:Gem::Dependency
100
+ name: thor
101
+ requirement: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ type: :runtime
107
+ prerelease: false
108
+ version_requirements: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
57
113
  description: 一个简单的网络爬虫框架.
58
114
  email:
59
115
  - tywf91@gmail.com
60
- executables: []
116
+ executables:
117
+ - flute
61
118
  extensions: []
62
119
  extra_rdoc_files: []
63
120
  files:
@@ -71,9 +128,23 @@ files:
71
128
  - Rakefile
72
129
  - bin/console
73
130
  - bin/setup
131
+ - exe/flute
74
132
  - flute.gemspec
75
133
  - lib/flute.rb
134
+ - lib/flute/document.rb
135
+ - lib/flute/network.rb
136
+ - lib/flute/request.rb
137
+ - lib/flute/response.rb
138
+ - lib/flute/spider_base.rb
76
139
  - lib/flute/version.rb
140
+ - lib/template/Gemfile
141
+ - lib/template/Rakefile
142
+ - lib/template/items/base_item.rb
143
+ - lib/template/items/rss_item.rb
144
+ - lib/template/items/web_item.rb
145
+ - lib/template/main.rb
146
+ - lib/template/middlewares/rss_middleware.rb
147
+ - lib/template/spiders/ifanr_spider.rb
77
148
  homepage: https://github.com/oxoooo/flute
78
149
  licenses:
79
150
  - MIT