rails_spider 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +165 -0
- data/README.md +33 -0
- data/app/assets/config/the_spider_manifest.js +4 -0
- data/app/assets/javascripts/the_spider/application.js +1 -0
- data/app/assets/stylesheets/the_spider/application.css +4 -0
- data/app/controllers/the_spider/application_controller.rb +9 -0
- data/app/controllers/the_spider/locals_controller.rb +62 -0
- data/app/controllers/the_spider/works_controller.rb +60 -0
- data/app/helpers/the_spider/application_helper.rb +4 -0
- data/app/helpers/the_spider/locals_helper.rb +4 -0
- data/app/helpers/the_spider/works_helper.rb +4 -0
- data/app/jobs/the_spider/application_job.rb +4 -0
- data/app/jobs/the_spider/parser_job.rb +11 -0
- data/app/jobs/the_spider/work_job.rb +11 -0
- data/app/mailers/the_spider/application_mailer.rb +6 -0
- data/app/models/rails_spider/application_record.rb +5 -0
- data/app/models/rails_spider/cookie.rb +9 -0
- data/app/models/rails_spider/failed_url.rb +7 -0
- data/app/models/rails_spider/local.rb +14 -0
- data/app/models/rails_spider/work.rb +24 -0
- data/app/views/layouts/the_spider/application.html.erb +14 -0
- data/app/views/the_spider/locals/_form.html.erb +17 -0
- data/app/views/the_spider/locals/edit.html.erb +6 -0
- data/app/views/the_spider/locals/index.html.erb +25 -0
- data/app/views/the_spider/locals/new.html.erb +5 -0
- data/app/views/the_spider/locals/show.html.erb +4 -0
- data/app/views/the_spider/works/_form.html.erb +9 -0
- data/app/views/the_spider/works/edit.html.erb +6 -0
- data/app/views/the_spider/works/index.html.erb +44 -0
- data/app/views/the_spider/works/new.html.erb +5 -0
- data/app/views/the_spider/works/show.html.erb +4 -0
- data/config/routes.rb +8 -0
- data/config/schedule.rb +35 -0
- data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
- data/lib/config/config.rb +27 -0
- data/lib/config/keywords.json +22 -0
- data/lib/config/proxy.json +10 -0
- data/lib/helper/helper.rb +6 -0
- data/lib/helper/location_helper.rb +46 -0
- data/lib/helper/price_helper.rb +23 -0
- data/lib/helper/tag_helper.rb +17 -0
- data/lib/helper/text_helper.rb +41 -0
- data/lib/helper/time_helper.rb +140 -0
- data/lib/logger.rb +146 -0
- data/lib/proxy/allproxylists.txt +2366 -0
- data/lib/proxy/proxy.rb +216 -0
- data/lib/proxy/proxylists.txt +625 -0
- data/lib/rails_spider.rb +10 -0
- data/lib/rails_spider/engine.rb +9 -0
- data/lib/rails_spider/fetchers.rb +2 -0
- data/lib/rails_spider/fetchers/base.rb +146 -0
- data/lib/rails_spider/fetchers/mechanize.rb +83 -0
- data/lib/rails_spider/fetchers/witar.rb +73 -0
- data/lib/rails_spider/parser.rb +14 -0
- data/lib/rails_spider/parser/szlawyers.rb +26 -0
- data/lib/rails_spider/resource.rb +58 -0
- data/lib/rails_spider/strategies.rb +6 -0
- data/lib/rails_spider/version.rb +3 -0
- data/lib/sync_qiniu.rb +35 -0
- data/lib/sync_qiniu/getimages.rb +98 -0
- data/lib/sync_qiniu/getimages_info.rb +37 -0
- data/lib/sync_qiniu/getlocation.rb +48 -0
- data/lib/sync_qiniu/getproxy.rb +95 -0
- data/lib/tasks/the_spider_tasks.rake +4 -0
- data/rakefile +284 -0
- metadata +165 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
module RailsSpider
  # ActiveRecord model for a crawl job. It owns many `locals`, wraps itself
  # in a Resource to run, and names a parser class through `parser_name`.
  class Work < ApplicationRecord
    has_many :locals

    # Builds the Resource for this work on first use and memoizes it.
    #
    # @return [Resource]
    def resource
      @resource ||= Resource.new(self)
    end

    # Runs this work through its resource.
    #
    # Uses the memoizing #resource reader instead of the raw instance
    # variable, which is nil until #resource has been called once.
    def run
      resource.run
    end

    # Resolves `parser_name` to a constant and memoizes the result.
    #
    # @return [Object, nil] the constant, or nil when the name does not
    #   resolve (the lookup is then retried on the next call)
    def parser
      @parser ||= parser_name.to_s.safe_constantize
    end

    # Calls `run` on every associated local.
    def parse
      locals.each(&:run)
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<!DOCTYPE html>
<html>
  <head>
    <%# Engine layout: loads the engine asset bundles and CSRF meta tags; the view renders at `yield`. %>
    <title>The spider</title>
    <%= stylesheet_link_tag 'rails_spider/application', media: 'all' %>
    <%= javascript_include_tag 'rails_spider/application' %>
    <%= csrf_meta_tags %>
  </head>
  <body>

    <%= yield %>

  </body>
</html>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<%# Form partial for the `local` record: lists validation errors (if any) above a submit button. %>
<%= form_with(model: local, local: true) do |f| %>
  <% if local.errors.any? %>
    <div id="error_explanation">
      <h2><%= pluralize(local.errors.count, "error") %> prohibited this local from being saved:</h2>

      <ul>
        <% local.errors.full_messages.each do |msg| %>
          <li><%= msg %></li>
        <% end %>
      </ul>
    </div>
  <% end %>

  <div class="actions">
    <%= f.submit %>
  </div>
<% end %>
|
@@ -0,0 +1,25 @@
|
|
1
|
+
<%# Index of @locals: one row per record with show / edit / destroy links. %>
<p id="notice"><%= notice %></p>

<h1>Locals</h1>

<table>
  <thead>
    <tr>
      <th colspan="3"></th>
    </tr>
  </thead>

  <tbody>
    <% @locals.each do |record| %>
      <tr>
        <td><%= link_to 'Show', record %></td>
        <td><%= link_to 'Edit', edit_local_path(record) %></td>
        <td><%= link_to 'Destroy', record, method: :delete, data: { confirm: 'Are you sure?' } %></td>
      </tr>
    <% end %>
  </tbody>
</table>

<br>

<%= link_to 'New Local', new_local_path %>
|
@@ -0,0 +1,9 @@
|
|
1
|
+
<%# Form partial for the `work` record: one text field per attribute, then submit. %>
<%= default_form_with(model: work, local: true) do |form| %>
  <%= form.text_field :name %>
  <%= form.text_field :parser_name %>
  <%= form.text_field :host %>
  <%= form.text_field :list_path %>
  <%= form.text_field :item_path %>
  <%= form.text_field :page_params %>
  <%= form.submit %>
<% end %>
|
@@ -0,0 +1,44 @@
|
|
1
|
+
<%# Index of @works: attributes per row plus run / edit / destroy actions, followed by pagination. %>
<div>

  <div class="ui top attached menu borderless">
    <div class="item"><strong>Works</strong></div>
  </div>

  <div class="ui segment top attached">
    <%= link_to 'Add New', new_work_path, class: 'ui teal button' %>
  </div>

  <table class="ui bottom attached table">
    <thead>
      <tr>
        <th>ID</th>
        <th>Name</th>
        <th>Host</th>
        <th>List</th>
        <th>Item</th>
        <th>PageParams</th>
        <%# NOTE(review): this header sits above the run-link column; no "disable" value is rendered in the rows — confirm the intended label. %>
        <th>Disable</th>
        <th></th>
        <th></th>
      </tr>
    </thead>

    <tbody>
      <% @works.each do |work| %>
        <tr>
          <td><%= work.id %></td>
          <td><%= work.name %></td>
          <td><%= work.host %></td>
          <td><%= work.list_path %></td>
          <td><%= work.item_path %></td>
          <td><%= work.page_params %></td>
          <td><%= link_to '运行', run_work_path(work), method: :patch, remote: true, data: { confirm: 'Are you sure?' }, class: 'ui blue label' %></td>
          <td><%= link_to 'Edit', edit_work_path(work) %></td>
          <td><%= link_to 'Destroy', work, method: :delete, data: { confirm: 'Are you sure?' } %></td>
        </tr>
      <% end %>
    </tbody>
  </table>

  <%= paginate @works %>
</div>
|
data/config/routes.rb
ADDED
data/config/schedule.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Use this file to easily define all of your cron jobs.
|
3
|
+
#
|
4
|
+
# It's helpful, but not entirely necessary to understand cron before proceeding.
|
5
|
+
# http://en.wikipedia.org/wiki/Cron
|
6
|
+
|
7
|
+
# Example:
|
8
|
+
#
|
9
|
+
# set :output, "/path/to/my/cron_log.log"
|
10
|
+
#
|
11
|
+
# every 2.hours do
|
12
|
+
# command "/usr/bin/some_great_command"
|
13
|
+
# runner "MyModel.some_method"
|
14
|
+
# rake "some:great:rake:task"
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# every 4.days do
|
18
|
+
#   runner "AnotherModel.prune_old_records"
|
19
|
+
# end
|
20
|
+
|
21
|
+
# Learn more: http://github.com/javan/whenever
|
22
|
+
# Cron output is appended to rake.log in the directory this schedule is
# evaluated from (Dir.pwd is interpolated when the file is read).
set :output, "#{Dir.pwd}/rake.log"
#env :PATH, ENV['PATH']

#job_type :rake, "rake everyday:update_and_sync"
#set :environment_variable, :SPIDER_ENV
#set :environment, :development

# Evening job: runs the everyday:grab_update rake task.
every :day, at: '9:30pm' do
  command "cd #{Dir.pwd} && rake everyday:grab_update"
end

# Morning job: runs the everyday:event_image rake task.
every :day, at: '8:30am' do
  command "cd #{Dir.pwd} && rake everyday:event_image"
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# Initial schema for the engine: creates the locals, cookies, failed_urls
# and works tables, all prefixed with `rails_spider_`.
class RailsSpiderInit < ActiveRecord::Migration[5.0]
  def change
    create_table :rails_spider_locals do |t|
      t.references :work
      t.string :url
      t.text :body, :draft
      t.timestamps
    end

    create_table :rails_spider_cookies do |t|
      t.string :name, :domain, :password, :value
      t.timestamps
    end

    create_table :rails_spider_failed_urls do |t|
      t.string :url, :source, :flat
      t.timestamps
    end

    create_table :rails_spider_works do |t|
      t.string :name
      t.string :parser_name, limit: 50
      t.string :host, :list_path, :item_path, :page_params
      t.timestamps
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'json'
|
2
|
+
module EventSpider
  # Settings object exposing `cities` and `event_class` accessors.
  class Configuration
    include ActiveSupport::Configurable

    config_accessor :cities, :event_class
  end

  class << self
    # The Configuration created by .configure; nil until .configure has run.
    attr_reader :config

    # Yields the Configuration, creating it on first use, so the caller can
    # set values on it. Returns whatever the block returns.
    def configure
      yield(@config ||= Configuration.new)
    end
  end

  # Defaults applied when this file is loaded.
  configure do |settings|
    settings.cities = ['上海', '北京', '深圳']
    settings.event_class = 'Event'
  end

  #config_path = File.expand_path('../config', __FILE__)
  #PROXY = JSON.load("#{config_path}/proxy.json")
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'json'
require 'net/http'
|
2
|
+
|
3
|
+
module LocationHelper
  # Baidu geocoding endpoint.
  # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
  BAIDU_GEOCODER_URL = 'http://api.map.baidu.com/geocoder/v2/'
  # NOTE(review): API key is committed in source — consider moving it to configuration.
  BAIDU_GEOCODER_AK = 'A38d59da730152d77b407446a3c0dd2b'
  # How many times a SocketError is retried before giving up.
  GEOCODER_MAX_RETRIES = 6

  # Geocodes an address string through the Baidu geocoder.
  #
  # The query string is built with URI.encode_www_form so characters such
  # as `&`, `#` or `=` inside the address are escaped instead of breaking
  # the request (URI.escape is obsolete and no longer available in Ruby 3).
  #
  # @param str [String] free-form address text
  # @return [Array] `[lng, lat]` on success; `[0.0, 0.0]` when the API
  #   reports a non-zero status; `[0.1, 0.1]` when the request keeps
  #   failing with SocketError after all retries
  # NOTE(review): other failures (e.g. an unparsable body) are not rescued
  # and propagate to the caller — the original carried an open TODO here.
  def transform(str)
    uri = URI(BAIDU_GEOCODER_URL)
    uri.query = URI.encode_www_form(address: str.to_s, output: 'json', ak: BAIDU_GEOCODER_AK)
    failures = 0

    begin
      response = Net::HTTP.get_response(uri)
      result = JSON.parse(response.body)
      # Status code documentation:
      # http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
      return [0.0, 0.0] unless result['status'] == 0

      # Example payload:
      # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
      point = result['result']['location']
      [point['lng'], point['lat']]
    rescue SocketError
      failures += 1
      retry if failures <= GEOCODER_MAX_RETRIES
      [0.1, 0.1]
    end
  end

  #def deal_location(location_text)
  #  if location_text.text != ""
  #    latitude = location_text.first.attr("content").to_f
  #    longitude = location_text.last.attr("content").to_f
  #    [longitude, latitude]
  #  else
  #    Location.transform(place)
  #  end
  #end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module PriceHelper
  # Extracts the distinct price values mentioned in a fee description.
  #
  # Example: "60 - 940元" => ["60", "940"]
  #
  # Decimal amounts are kept whole ("99.5元" => ["99.5"]) rather than being
  # split at the decimal point. "免费" or "FREE" contributes "0", and "未知"
  # is passed through as-is. Values are de-duplicated and ordered by their
  # numeric value (non-numeric entries count as 0).
  #
  # @param str [String, nil] raw fee text; nil is treated as empty
  # @return [Array<String>] price strings in ascending numeric order
  def deal_fee(str)
    text = str.to_s
    fee = text.scan(/\d+(?:\.\d+)?/)
    fee << '0' if text.include?('免费') || text.include?('FREE')
    fee << '未知' if text.include?('未知')
    fee.uniq.sort_by(&:to_f)
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module TagHelper
  # Recognised top-level event kinds.
  KINDS = /音乐|戏剧|讲座|聚会|电影|展览|运动|公益|旅行|派对/

  # Returns the first recognised kind found in the text, or '其他' when
  # none of the KINDS occurs.
  #
  # @param str [String]
  # @return [String]
  def deal_kind(str)
    str[KINDS] || '其他'
  end

  # Returns everything after the first "-" in the text.
  #
  # @param str [String]
  # @return [String, nil] the trailing part (possibly empty), or nil when
  #   the text contains no "-"
  def deal_subkind(str)
    _head, dash, rest = str.partition('-')
    rest unless dash.empty?
  end
end
|