flute 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 96163e0495e9bc2f65b308943c132fbb622ef4a1
4
- data.tar.gz: 23092154b381dd0a02fe18d5e81aa0ab4a691d4c
3
+ metadata.gz: 17b89f3543d87eedc0830aa9e3673d45bb4e391f
4
+ data.tar.gz: feeaa5ae2b1b59701042f662c670d74b558a1822
5
5
  SHA512:
6
- metadata.gz: 1ce87363fc3566fed6590d9fcebd547bfd0fa1574d22d300bb6f32430a9dd4a20c8cbf761b7b3292b5fbbffad1c5d0a67976e9c315e0ca6785cd7accdc9c73b7
7
- data.tar.gz: 570a96fce99075b8f68aa9ab8fd968b8c997caeab78a396f6192f5de4add5d37e26321279d4f7ccf18bbef40e4255ef6126fdf562a5d59e144ec9e9c6c33e45c
6
+ metadata.gz: 1f97074135702109a1bbbc2f6a1500e1982d4e09c7ee4c1a13c06fcede28e66d7ea1bef78a935625b163a74f479f9c4c2ed2b24ce4aa2ea06b185724f17b4c5b
7
+ data.tar.gz: ccaec98172277454fd21e79d41b80b95614277f215eae5cd7594d6eb9bff1ebc9c27bb43f8eea047764f815d48b03766318831ba2f86aabb72769ced9d62f414
data/README.md CHANGED
@@ -9,23 +9,13 @@
9
9
 
10
10
  ## Installation
11
11
 
12
- Add this line to your application's Gemfile:
13
-
14
- ```ruby
15
- gem 'flute'
16
- ```
17
-
18
- And then execute:
19
-
20
- $ bundle
21
-
22
- Or install it yourself as:
23
-
24
12
  $ gem install flute
25
13
 
14
+ 之后可以使用 `flute new project_name` 来创建一个项目。
15
+
26
16
  ## Usage
27
17
 
28
- TODO
18
+ 项目包含了一个抓取爱范儿（ifanr）的爬虫示例，可供参考；今后或许会发布更多实际例子。
29
19
 
30
20
 
31
21
  ## License
data/exe/flute ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'thor'
4
+ require 'fileutils'
5
+
6
# Thor-based command line interface behind the `flute` executable.
class FluteCLI < Thor
  # Directories created inside a new project. Parents are listed before
  # their children because each one is created with a plain Dir.mkdir.
  PROJECT_DIRS = %w[tmp log config lib app app/utils].freeze

  # Template files copied as-is into the project root.
  TEMPLATE_FILES = %w[Gemfile main.rb Rakefile].freeze

  # Template directories copied recursively into <project>/app.
  TEMPLATE_DIRS = %w[items spiders middlewares].freeze

  # Tell Thor to exit with a non-zero status when a command fails, so that
  # shell scripts can detect the failure.
  def self.exit_on_failure?
    true
  end

  desc "new NAME", "create project"
  # Generates a new project skeleton in the directory NAME (relative to the
  # current working directory) and prints the commands to run next.
  #
  # Raises Thor::Error when NAME already exists, instead of letting
  # Dir.mkdir fail with a raw Errno::EEXIST backtrace.
  def new(name)
    raise Thor::Error, "#{name} already exists" if File.exist?(name)

    create_dir(name, PROJECT_DIRS)
    # Placeholder files so the otherwise empty directories are not lost
    FileUtils.touch "#{name}/tmp/.keep"
    FileUtils.touch "#{name}/log/.keep"

    TEMPLATE_FILES.each do |file|
      copy_template file, "#{name}/#{file}"
    end

    TEMPLATE_DIRS.each do |dir|
      FileUtils.cp_r template_path(dir), "#{name}/app/#{dir}"
    end

    puts "cd #{name}"
    puts "bundle install"
    puts "rake start"
  end

  private

  # Creates the project root and then every entry of +dirs+ beneath it.
  def create_dir(project_name, dirs)
    Dir.mkdir(project_name)
    dirs.each do |dir|
      Dir.mkdir "#{project_name}/#{dir}"
    end
  end

  # Copies the single template file +name+ to +dest+.
  def copy_template(name, dest)
    FileUtils.cp template_path(name), dest
  end

  # Absolute path of a bundled template, resolved relative to this file
  # (this file's directory, then ../lib/template/<name>).
  def template_path(name)
    File.expand_path("../lib/template/#{name}", File.dirname(__FILE__))
  end
end
50
+
51
+ FluteCLI.start(ARGV)
data/flute.gemspec CHANGED
@@ -17,11 +17,18 @@ Gem::Specification.new do |spec|
17
17
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
18
18
  f.match(%r{^(test|spec|features)/})
19
19
  end
20
- spec.bindir = "exe"
21
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+
21
+ spec.bindir = "exe"
22
+ spec.executables = ["flute"]
22
23
  spec.require_paths = ["lib"]
23
24
 
24
25
  spec.add_development_dependency "bundler", "~> 1.13"
25
26
  spec.add_development_dependency "rake", "~> 10.0"
26
27
  spec.add_development_dependency "rspec", "~> 3.0"
28
+
29
+ spec.add_dependency 'activesupport', '~> 5.0.1'
30
+ spec.add_dependency 'typhoeus', '~> 1.1.2'
31
+ spec.add_dependency 'nokogiri', '~> 1.7.0'
32
+ spec.add_dependency 'thor'
33
+
27
34
  end
@@ -0,0 +1,28 @@
1
module Flute
  # Nokogiri HTML document with shortcuts for reading <meta> tag contents.
  class Document < Nokogiri::HTML::Document
    # Defines og_title, og_description, og_image and og_site_name; each one
    # delegates to #og with the matching property name.
    %w[title description image site_name].each do |field|
      define_method("og_#{field}") { og(field) }
    end

    # Stripped content of the first <meta name="description"> tag, or nil
    # when there is no such tag or it has no content attribute.
    def description
      meta_tag = css('meta[name="description"]').first
      meta_tag&.attr('content')&.strip
    end

    # Content attribute of the first <meta property="og:NAME"> tag, or nil
    # when the tag is missing.
    def og(name)
      meta_tag = css("meta[property='og:#{name}']").first
      meta_tag&.attr('content')
    end
  end
end
@@ -0,0 +1,29 @@
1
module Flute
  # Thin wrapper around Typhoeus that builds requests with shared options.
  class Network
    # Performs a GET through Typhoeus.get and wraps the result in a
    # Flute::Response.
    def get(url, params: {}, headers: {})
      Response.new(Typhoeus.get(url, make_options(params, headers)))
    end

    # Builds (but does not run) a Flute::Request for +url+ using the shared
    # options plus the given HTTP method and body.
    def request(url, params: {}, headers: {}, method: :get, body: {})
      Request.new(url, make_options(params, headers).merge(method: method, body: body))
    end

    # Option hash shared by every request: the given headers and params,
    # with followlocation enabled and accept_encoding set to 'gzip'.
    def make_options(params = {}, headers = {})
      { headers: headers, params: params, followlocation: true, accept_encoding: 'gzip' }
    end

    # Returns a new Typhoeus::Hydra created with the given max_concurrency.
    def create_manager(max_concurrency: 5)
      Typhoeus::Hydra.new(max_concurrency: max_concurrency)
    end
  end
end
@@ -0,0 +1,19 @@
1
module Flute
  # Typhoeus request that can carry an arbitrary payload and parse its own
  # response body.
  class Request < Typhoeus::Request
    # Free-form object attached to the request by the caller (nil until set).
    attr_accessor :meta

    # Response body parsed into a Flute::Document.
    def response_html
      Document.parse(response.body)
    end

    # Response body parsed as JSON, with symbolize_keys applied to the result.
    def response_json
      parsed = JSON.parse(response.body)
      parsed.symbolize_keys
    end
  end
end
@@ -0,0 +1,22 @@
1
module Flute
  # Snapshot of the status code and body of an HTTP response.
  class Response
    attr_reader :status, :body

    # +response+ must respond to #code and #body.
    def initialize(response)
      @status, @body = response.code, response.body
    end

    # True when the status is at least 200 and below 300.
    def success?
      (200 <= @status) && (@status < 300)
    end

    # Body parsed into a Flute::Document; the parsed document is cached.
    def parse_html
      return @parse_html if @parse_html

      @parse_html = Document.parse(@body)
    end

    # Body parsed as JSON with symbolize_keys applied; the result is cached.
    def parse_json
      return @parse_json if @parse_json

      @parse_json = JSON.parse(@body).symbolize_keys
    end
  end
end
@@ -0,0 +1,83 @@
1
module Flute
  # Base class for spiders. A spider is a chain of middleware methods: each
  # one receives the value returned by the previous one, starting with #items.
  class SpiderBase
    # Request manager created by #reset_manager (nil before that).
    attr_reader :manager

    # Per-class options. Created on first access with the default middleware
    # chain and a fresh Network instance.
    def self.opts
      @opts ||= {
        middleware: %i[before_parse init_queue parse run_queue after_parse],
        network: Network.new
      }
    end

    # Stores the spider name in the class options.
    def self.set_name(value)
      opts[:name] = value
    end

    # Stores the start URLs in the class options.
    def self.start_urls(value)
      opts[:start_urls] = value
    end

    # Replaces the list of middleware method names run by #pipeline.
    def self.middleware(value)
      opts[:middleware] = value
    end

    # Replaces the network object stored in the class options.
    def self.set_network(value)
      opts[:network] = value
    end

    # Network object used to create the request manager.
    def manager_network
      network
    end

    # Network object configured on the spider class.
    def network
      self.class.opts[:network]
    end

    # Replaces the current manager with a new one from the network.
    def reset_manager
      @manager = manager_network.create_manager
    end

    # Initial value fed into the pipeline; empty here.
    def items
      []
    end

    # Middleware hook; returns its argument unchanged here.
    def before_parse(batch)
      batch
    end

    # Middleware step: creates a fresh manager and passes the batch through.
    def init_queue(batch)
      reset_manager
      batch
    end

    # Middleware hook; returns its argument unchanged here.
    def parse(batch)
      batch
    end

    # Middleware step: runs the manager when it has queued requests, then
    # passes the batch through.
    def run_queue(batch)
      manager.run unless manager.queued_requests.size.zero?
      batch
    end

    # Middleware hook; returns its argument unchanged here.
    def after_parse(requests)
      requests
    end

    # Resolves every configured middleware name to a method up front, then
    # threads the result of #items through them in order.
    def pipeline
      steps = self.class.opts[:middleware].map { |step_name| method(step_name) }
      steps.inject(items) { |carry, step| step.call(carry) }
    end
  end
end
data/lib/flute/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Flute
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.1"
3
3
  end
data/lib/flute.rb CHANGED
@@ -1,5 +1,15 @@
1
1
  require "flute/version"
2
2
 
3
+ require 'active_support/all'
4
+ require 'nokogiri'
5
+ require 'typhoeus'
6
+
7
+ require 'flute/document'
8
+ require 'flute/request'
9
+ require 'flute/response'
10
+ require 'flute/network'
11
+ require 'flute/spider_base'
12
+
3
13
  module Flute
4
- # Your code goes here...
14
+
5
15
  end
@@ -0,0 +1,14 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'flute'
4
+
5
+ gem 'nokogiri'
6
+ gem 'activesupport', require: 'active_support/all'
7
+ gem 'typhoeus'
8
+
9
+ gem 'virtus'
10
+ gem 'feedjira'
11
+
12
+ group :development do
13
+ gem 'pry'
14
+ end
@@ -0,0 +1,21 @@
1
+ require_relative('./main')
2
+
3
# Helpers mixed into the IRB console started by the :console task.
module IRBExtensition
  # Prints a notice, clears ActiveSupport::Dependencies and returns nil.
  def reload!
    $stdout.puts 'reloading...'
    ActiveSupport::Dependencies.clear
    nil
  end
end
10
+
11
# Tasks carry a description so that they are listed by `rake -T`.

desc 'Start an IRB session with the reload! helper available'
task :console do
  require 'irb'
  # NOTE(review): presumably cleared so IRB does not try to interpret the
  # arguments that were passed to rake - confirm.
  ARGV.clear

  IRB::ExtendCommandBundle.send :include, IRBExtensition
  IRB.start
end

desc 'Run the crawler via start_ant'
task :start do
  start_ant
end
@@ -0,0 +1,5 @@
1
# Common parent of all scraped items: a Virtus model with a single
# String attribute, the item's URL.
class BaseItem
  include Virtus.model

  attribute(:url, String)
end
@@ -0,0 +1,17 @@
1
# Item built from one entry of an RSS feed.
class RssItem < BaseItem
  attribute :title, String
  attribute :summary, String
  attribute :created, Integer
  attribute :content, String
  attribute :seed_id, Integer
  attribute :head_name, String
  attribute :head_icon, String, default: 'link'

  class << self
    # Builds an RssItem from a feed entry responding to #title, #url,
    # #summary, #content and #published. Text fields are squished when
    # present; the publication time is converted with #to_i.
    def create(rss_item)
      fields = {
        title: rss_item.title&.squish,
        url: rss_item.url,
        summary: rss_item.summary&.squish,
        content: rss_item.content&.squish,
        created: rss_item.published.to_i
      }
      RssItem.new(fields)
    end
  end
end
@@ -0,0 +1,7 @@
1
# Item describing a web page: its title, description and image, in
# addition to the URL inherited from BaseItem.
class WebItem < BaseItem
  attribute(:title, String)
  attribute(:description, String)
  attribute(:image, String)
end
@@ -0,0 +1,27 @@
1
# Environment name; also passed to Bundler.require as an extra group.
env = ENV.fetch('CRAWLER_ENV', 'development')
# NOTE(review): a String when AREA_ENV is set, but the Symbol :cn otherwise -
# confirm that callers handle both.
AREA = ENV.fetch('AREA_ENV', :cn)

require 'rubygems'
require 'bundler'
Bundler.require(:default, env)

# Directory containing this file, used as the project root.
$project_root = Pathname.new(File.dirname(__FILE__))

# Register the app/ sub-directories as ActiveSupport autoload paths.
load_paths = %w[utils items middlewares spiders].map { |dir| $project_root.join('app', dir) }

ActiveSupport::Dependencies.autoload_paths += load_paths
15
+
16
# Instantiates every spider found in app/spiders/*.rb and runs its pipeline.
#
# Each file name is mapped to a class via classify/constantize (for example
# ifanr_spider.rb -> IfanrSpider). Unless CRAWLER_ENV is 'production', the
# resulting items are printed as hashes.
#
# The environment is read from ENV inside the method: a local variable
# assigned at the top level of the file is not visible inside a `def`, so
# referring to it here would raise NameError.
#
# Returns the array of spider classes.
def start_ant
  production = (ENV['CRAWLER_ENV'] || 'development') == 'production'

  pattern = $project_root.join('app', 'spiders', '*.rb').to_s
  spider_classes = Dir[pattern].map do |path|
    File.basename(path, '.rb').classify.constantize
  end

  spider_classes.each do |spider_class|
    results = spider_class.new.pipeline
    puts(results.map(&:to_h)) unless production
  end
end
@@ -0,0 +1,16 @@
1
# Mixin that sources a spider's items from the RSS feed at its first start URL.
module RssMiddleware
  # Downloads the feed, parses it with Feedjira and converts every entry
  # that has a non-blank URL into an RssItem. The body, the parsed feed and
  # the item list are each cached in an instance variable.
  def rss_items
    @rss_body ||= network.get(self.class.opts[:start_urls].first).body
    @rss ||= Feedjira::Feed.parse(@rss_body)
    @rss_items ||= @rss.entries
                       .reject { |entry| entry.url.blank? }
                       .map { |entry| RssItem.create(entry) }
  end

  # The first six feed items.
  def items
    rss_items.first(6)
  end
end
@@ -0,0 +1,35 @@
1
# Example spider: reads the ifanr RSS feed, fetches every linked page and
# enriches each item with the page's Open Graph description and image.
class IfanrSpider < Flute::SpiderBase
  include RssMiddleware

  set_name 'ifanr'
  start_urls ['http://www.ifanr.com/feed']

  # Converts the feed items into WebItems, keeping only url and title.
  def before_parse(items)
    items.map { |entry| WebItem.new(url: entry.url, title: entry.title) }
  end

  # Queues one request per item on the manager and returns the requests,
  # each carrying its item in #meta.
  def parse(items)
    items.map do |item|
      network.request(item.url).tap do |request|
        request.meta = item
        manager.queue(request)
      end
    end
  end

  # Fills in description and (when not blank) image from each response,
  # and returns the items.
  def after_parse(requests)
    requests.map do |request|
      page = request.response_html
      request.meta.tap do |item|
        item.description = page.og_description
        image = page.og_image
        item.image = image unless image.blank?
      end
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: flute
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - 'manjia
@@ -54,10 +54,67 @@ dependencies:
54
54
  - - "~>"
55
55
  - !ruby/object:Gem::Version
56
56
  version: '3.0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: activesupport
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: 5.0.1
64
+ type: :runtime
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - "~>"
69
+ - !ruby/object:Gem::Version
70
+ version: 5.0.1
71
+ - !ruby/object:Gem::Dependency
72
+ name: typhoeus
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: 1.1.2
78
+ type: :runtime
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - "~>"
83
+ - !ruby/object:Gem::Version
84
+ version: 1.1.2
85
+ - !ruby/object:Gem::Dependency
86
+ name: nokogiri
87
+ requirement: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - "~>"
90
+ - !ruby/object:Gem::Version
91
+ version: 1.7.0
92
+ type: :runtime
93
+ prerelease: false
94
+ version_requirements: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - "~>"
97
+ - !ruby/object:Gem::Version
98
+ version: 1.7.0
99
+ - !ruby/object:Gem::Dependency
100
+ name: thor
101
+ requirement: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ type: :runtime
107
+ prerelease: false
108
+ version_requirements: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
57
113
  description: 一个简单的网络爬虫框架.
58
114
  email:
59
115
  - tywf91@gmail.com
60
- executables: []
116
+ executables:
117
+ - flute
61
118
  extensions: []
62
119
  extra_rdoc_files: []
63
120
  files:
@@ -71,9 +128,23 @@ files:
71
128
  - Rakefile
72
129
  - bin/console
73
130
  - bin/setup
131
+ - exe/flute
74
132
  - flute.gemspec
75
133
  - lib/flute.rb
134
+ - lib/flute/document.rb
135
+ - lib/flute/network.rb
136
+ - lib/flute/request.rb
137
+ - lib/flute/response.rb
138
+ - lib/flute/spider_base.rb
76
139
  - lib/flute/version.rb
140
+ - lib/template/Gemfile
141
+ - lib/template/Rakefile
142
+ - lib/template/items/base_item.rb
143
+ - lib/template/items/rss_item.rb
144
+ - lib/template/items/web_item.rb
145
+ - lib/template/main.rb
146
+ - lib/template/middlewares/rss_middleware.rb
147
+ - lib/template/spiders/ifanr_spider.rb
77
148
  homepage: https://github.com/oxoooo/flute
78
149
  licenses:
79
150
  - MIT