rails_spider 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +165 -0
- data/README.md +33 -0
- data/app/assets/config/the_spider_manifest.js +4 -0
- data/app/assets/javascripts/the_spider/application.js +1 -0
- data/app/assets/stylesheets/the_spider/application.css +4 -0
- data/app/controllers/the_spider/application_controller.rb +9 -0
- data/app/controllers/the_spider/locals_controller.rb +62 -0
- data/app/controllers/the_spider/works_controller.rb +60 -0
- data/app/helpers/the_spider/application_helper.rb +4 -0
- data/app/helpers/the_spider/locals_helper.rb +4 -0
- data/app/helpers/the_spider/works_helper.rb +4 -0
- data/app/jobs/the_spider/application_job.rb +4 -0
- data/app/jobs/the_spider/parser_job.rb +11 -0
- data/app/jobs/the_spider/work_job.rb +11 -0
- data/app/mailers/the_spider/application_mailer.rb +6 -0
- data/app/models/rails_spider/application_record.rb +5 -0
- data/app/models/rails_spider/cookie.rb +9 -0
- data/app/models/rails_spider/failed_url.rb +7 -0
- data/app/models/rails_spider/local.rb +14 -0
- data/app/models/rails_spider/work.rb +24 -0
- data/app/views/layouts/the_spider/application.html.erb +14 -0
- data/app/views/the_spider/locals/_form.html.erb +17 -0
- data/app/views/the_spider/locals/edit.html.erb +6 -0
- data/app/views/the_spider/locals/index.html.erb +25 -0
- data/app/views/the_spider/locals/new.html.erb +5 -0
- data/app/views/the_spider/locals/show.html.erb +4 -0
- data/app/views/the_spider/works/_form.html.erb +9 -0
- data/app/views/the_spider/works/edit.html.erb +6 -0
- data/app/views/the_spider/works/index.html.erb +44 -0
- data/app/views/the_spider/works/new.html.erb +5 -0
- data/app/views/the_spider/works/show.html.erb +4 -0
- data/config/routes.rb +8 -0
- data/config/schedule.rb +35 -0
- data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
- data/lib/config/config.rb +27 -0
- data/lib/config/keywords.json +22 -0
- data/lib/config/proxy.json +10 -0
- data/lib/helper/helper.rb +6 -0
- data/lib/helper/location_helper.rb +46 -0
- data/lib/helper/price_helper.rb +23 -0
- data/lib/helper/tag_helper.rb +17 -0
- data/lib/helper/text_helper.rb +41 -0
- data/lib/helper/time_helper.rb +140 -0
- data/lib/logger.rb +146 -0
- data/lib/proxy/allproxylists.txt +2366 -0
- data/lib/proxy/proxy.rb +216 -0
- data/lib/proxy/proxylists.txt +625 -0
- data/lib/rails_spider.rb +10 -0
- data/lib/rails_spider/engine.rb +9 -0
- data/lib/rails_spider/fetchers.rb +2 -0
- data/lib/rails_spider/fetchers/base.rb +146 -0
- data/lib/rails_spider/fetchers/mechanize.rb +83 -0
- data/lib/rails_spider/fetchers/witar.rb +73 -0
- data/lib/rails_spider/parser.rb +14 -0
- data/lib/rails_spider/parser/szlawyers.rb +26 -0
- data/lib/rails_spider/resource.rb +58 -0
- data/lib/rails_spider/strategies.rb +6 -0
- data/lib/rails_spider/version.rb +3 -0
- data/lib/sync_qiniu.rb +35 -0
- data/lib/sync_qiniu/getimages.rb +98 -0
- data/lib/sync_qiniu/getimages_info.rb +37 -0
- data/lib/sync_qiniu/getlocation.rb +48 -0
- data/lib/sync_qiniu/getproxy.rb +95 -0
- data/lib/tasks/the_spider_tasks.rake +4 -0
- data/rakefile +284 -0
- metadata +165 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
module RailsSpider
  # A crawl job record. It owns the associated +locals+ rows, wraps itself in
  # a Resource for running, and looks up its parser from +parser_name+.
  class Work < ApplicationRecord
    has_many :locals

    # Returns the Resource built around this work, creating it on first use.
    def resource
      @resource ||= Resource.new(self)
    end

    # Runs this work's resource.
    #
    # Goes through the memoizing +resource+ reader instead of the raw
    # instance variable: the ivar is only assigned inside +resource+, so
    # reading it directly raised NoMethodError on nil whenever +run+ was the
    # first method called on the record.
    def run
      resource.run
    end

    # Resolves +parser_name+ to a constant via +safe_constantize+ and caches
    # the result once a constant has been found.
    def parser
      @parser ||= parser_name.to_s.safe_constantize
    end

    # Calls +run+ on every associated local.
    def parse
      locals.each(&:run)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html>
<html>
<head>
  <title>The spider</title>
  <%# The package ships its assets under app/assets/*/the_spider/, so the
      logical asset path is "the_spider/application". %>
  <%= stylesheet_link_tag "the_spider/application", media: "all" %>
  <%= javascript_include_tag "the_spider/application" %>
  <%= csrf_meta_tags %>
</head>
<body>

<%= yield %>

</body>
</html>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<%# Form partial for a local: lists validation errors (if any), then the submit button. %>
<%= form_with(model: local, local: true) do |f| %>
  <% unless local.errors.empty? %>
    <div id="error_explanation">
      <h2><%= pluralize(local.errors.count, "error") %> prohibited this local from being saved:</h2>

      <ul>
        <% local.errors.full_messages.each do |msg| %>
          <li><%= msg %></li>
        <% end %>
      </ul>
    </div>
  <% end %>

  <div class="actions">
    <%= f.submit %>
  </div>
<% end %>
|
@@ -0,0 +1,25 @@
|
|
1
|
+
<%# Index of locals: one row per record with show / edit / destroy links. %>
<p id="notice"><%= notice %></p>

<h1>Locals</h1>

<table>
  <thead>
    <tr>
      <th colspan="3"></th>
    </tr>
  </thead>

  <tbody>
    <% @locals.each do |record| %>
      <tr>
        <td><%= link_to('Show', record) %></td>
        <td><%= link_to('Edit', edit_local_path(record)) %></td>
        <td><%= link_to('Destroy', record, method: :delete, data: { confirm: 'Are you sure?' }) %></td>
      </tr>
    <% end %>
  </tbody>
</table>

<br>

<%= link_to('New Local', new_local_path) %>
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<%# Form partial for a work: one text input per attribute, in display order, then submit. %>
<%= default_form_with(model: work, local: true) do |form| %>
  <% %i[name parser_name host list_path item_path page_params].each do |field| %>
    <%= form.text_field field %>
  <% end %>
  <%= form.submit %>
<% end %>
|
@@ -0,0 +1,44 @@
|
|
1
|
+
<%# Index of works: attribute columns plus run / edit / destroy actions, paginated. %>
<div>

  <div class="ui top attached menu borderless">
    <%# This page lists works, so the menu title says so (it previously read "Users"). %>
    <div class="item"><strong>Works</strong></div>
  </div>

  <div class="ui segment top attached">
    <%= link_to 'Add New', new_work_path, class: 'ui teal button' %>
  </div>

  <table class="ui bottom attached table">
    <thead>
      <tr>
        <th>ID</th>
        <th>Name</th>
        <th>Host</th>
        <th>List</th>
        <th>Item</th>
        <th>PageParams</th>
        <th>Disable</th>
        <th></th>
        <th></th>
      </tr>
    </thead>

    <tbody>
      <% @works.each do |work| %>
        <tr>
          <td><%= work.id %></td>
          <td><%= work.name %></td>
          <td><%= work.host %></td>
          <td><%= work.list_path %></td>
          <td><%= work.item_path %></td>
          <td><%= work.page_params %></td>
          <td><%= link_to '运行', run_work_path(work), method: :patch, remote: true, data: { confirm: 'Are you sure?' }, class: 'ui blue label' %></td>
          <td><%= link_to 'Edit', edit_work_path(work) %></td>
          <td><%= link_to 'Destroy', work, method: :delete, data: { confirm: 'Are you sure?' } %></td>
        </tr>
      <% end %>
    </tbody>
  </table>

  <%= paginate @works %>
</div>
|
data/config/routes.rb
ADDED
data/config/schedule.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Use this file to easily define all of your cron jobs.
|
3
|
+
#
|
4
|
+
# It's helpful, but not entirely necessary to understand cron before proceeding.
|
5
|
+
# http://en.wikipedia.org/wiki/Cron
|
6
|
+
|
7
|
+
# Example:
|
8
|
+
#
|
9
|
+
# set :output, "/path/to/my/cron_log.log"
|
10
|
+
#
|
11
|
+
# every 2.hours do
|
12
|
+
# command "/usr/bin/some_great_command"
|
13
|
+
# runner "MyModel.some_method"
|
14
|
+
# rake "some:great:rake:task"
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# every 4.days do
|
18
|
+
#   runner "AnotherModel.prune_old_records"
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Learn more: http://github.com/javan/whenever
|
22
|
+
# Directory the schedule is evaluated from; used for the log file and as the
# working directory of every cron command below.
project_dir = Dir.pwd

set :output, "#{project_dir}/rake.log"
#env :PATH, ENV['PATH']

#job_type :rake, "rake everyday:update_and_sync"
#set :environment_variable, :SPIDER_ENV
#set :environment, :development

# Daily jobs as [time of day, rake task under the everyday: namespace].
[
  ['9:30pm', 'grab_update'],
  ['8:30am', 'event_image']
].each do |run_at, task|
  every :day, at: run_at do
    command "cd #{project_dir} && rake everyday:#{task}"
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Initial schema: creates the locals, cookies, failed_urls and works tables,
# all prefixed with rails_spider_.
class RailsSpiderInit < ActiveRecord::Migration[5.0]
  def change
    create_locals_table
    create_cookies_table
    create_failed_urls_table
    create_works_table
  end

  private

  # Rows that reference a work and hold a url plus body/draft text.
  def create_locals_table
    create_table :rails_spider_locals do |t|
      t.references :work
      t.string :url
      t.text :body, :draft
      t.timestamps
    end
  end

  # Cookie rows: name, domain, password and value.
  def create_cookies_table
    create_table :rails_spider_cookies do |t|
      t.string :name, :domain, :password, :value
      t.timestamps
    end
  end

  # Failed URL rows: url, source and flat.
  def create_failed_urls_table
    create_table :rails_spider_failed_urls do |t|
      t.string :url, :source, :flat
      t.timestamps
    end
  end

  # Work rows; parser_name is limited to 50 characters.
  def create_works_table
    create_table :rails_spider_works do |t|
      t.string :name
      t.string :parser_name, limit: 50
      t.string :host, :list_path, :item_path, :page_params
      t.timestamps
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'json'
|
2
|
+
# Module-level configuration for the spider: a single Configuration object
# that is created on the first +configure+ call and exposed through +config+.
module EventSpider

  # Holds the configurable settings (+cities+ and +event_class+).
  class Configuration
    include ActiveSupport::Configurable
    config_accessor :cities, :event_class
  end

  class << self
    # The current Configuration, or nil if +configure+ has not been called.
    attr_reader :config

    # Yields the shared Configuration (building it if needed) to the block.
    def configure
      @config ||= EventSpider::Configuration.new
      yield @config
    end
  end

  # Defaults applied when this file is loaded.
  configure do |settings|
    settings.cities = ['上海', '北京', '深圳']
    settings.event_class = 'Event'
  end

  #config_path = File.expand_path('../config', __FILE__)
  #PROXY = JSON.load("#{config_path}/proxy.json")

end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
|
3
|
+
# Geocoding helper backed by the Baidu geocoder web service.
module LocationHelper

  # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
  GEOCODER_ENDPOINT = "http://api.map.baidu.com/geocoder/v2/".freeze
  # NOTE(review): the access key is committed in source; consider moving it
  # to configuration.
  GEOCODER_AK = "A38d59da730152d77b407446a3c0dd2b".freeze
  # How many times a request is retried after a SocketError before giving up.
  GEOCODER_MAX_RETRIES = 6

  # Looks up the coordinates of an address.
  #
  # @param str [String] the address text to geocode
  # @return [Array<Float>] +[lng, lat]+ on success; +[0.0, 0.0]+ when the
  #   service answers with a non-zero status; +[0.1, 0.1]+ when every attempt
  #   failed with SocketError
  def transform(str)
    # Build the query with encode_www_form: URI.escape is gone in Ruby 3.0+
    # and never escaped characters such as "&" or "#" inside the address.
    uri = URI(GEOCODER_ENDPOINT)
    uri.query = URI.encode_www_form(address: str, output: "json", ak: GEOCODER_AK)

    retries = 0
    begin
      response = Net::HTTP.get_response(uri)
      result = JSON.parse(response.body)
      # Status code documentation:
      # http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
      return [0.0, 0.0] unless result["status"] == 0

      # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
      result["result"]["location"].values_at("lng", "lat")
    rescue SocketError
      retries += 1
      retry if retries <= GEOCODER_MAX_RETRIES
      [0.1, 0.1]
    end
  end

  #def deal_location(location_text)
  #  if location_text.text != ""
  #    latitude = location_text.first.attr("content").to_f
  #    longitude = location_text.last.attr("content").to_f
  #    [longitude, latitude]
  #  else
  #    Location.transform(place)
  #  end
  #end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Helpers for normalizing free-text price descriptions.
module PriceHelper

  # Extracts the fee values mentioned in +str+.
  #
  # Every run of digits is collected; "免费" or "FREE" adds "0" and "未知"
  # adds "未知". Duplicates are removed and the values are ordered by their
  # integer value.
  #
  # example: 60 - 940元
  #
  # @param str [String] the raw price text
  # @return [Array<String>] the distinct fee strings
  def deal_fee(str)
    amounts = str.scan(/\d+/)
    amounts << "0" if str.include?("免费")
    amounts << "0" if str.include?("FREE")
    amounts << "未知" if str.include?("未知")
    amounts.uniq.sort_by(&:to_i)
  end

end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Helpers for deriving an event kind / sub-kind from tag text.
module TagHelper

  # The recognized top-level kinds.
  KINDS = /音乐|戏剧|讲座|聚会|电影|展览|运动|公益|旅行|派对/

  # Returns the first recognized kind found in +str+, or '其他' when none of
  # the KINDS alternatives occurs.
  #
  # Uses plain String indexing so the method needs nothing beyond core Ruby
  # (the previous version depended on ActiveSupport's +blank?+).
  #
  # @param str [String]
  # @return [String]
  def deal_kind(str)
    str[KINDS] || '其他'
  end

  # Returns everything after the first "-" in +str+, or nil when +str+
  # contains no "-".
  #
  # @param str [String]
  # @return [String, nil]
  def deal_subkind(str)
    dash_at = str.index("-")
    str[(dash_at + 1)..-1] if dash_at
  end

end
|