rails_spider 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +165 -0
  3. data/README.md +33 -0
  4. data/app/assets/config/the_spider_manifest.js +4 -0
  5. data/app/assets/javascripts/the_spider/application.js +1 -0
  6. data/app/assets/stylesheets/the_spider/application.css +4 -0
  7. data/app/controllers/the_spider/application_controller.rb +9 -0
  8. data/app/controllers/the_spider/locals_controller.rb +62 -0
  9. data/app/controllers/the_spider/works_controller.rb +60 -0
  10. data/app/helpers/the_spider/application_helper.rb +4 -0
  11. data/app/helpers/the_spider/locals_helper.rb +4 -0
  12. data/app/helpers/the_spider/works_helper.rb +4 -0
  13. data/app/jobs/the_spider/application_job.rb +4 -0
  14. data/app/jobs/the_spider/parser_job.rb +11 -0
  15. data/app/jobs/the_spider/work_job.rb +11 -0
  16. data/app/mailers/the_spider/application_mailer.rb +6 -0
  17. data/app/models/rails_spider/application_record.rb +5 -0
  18. data/app/models/rails_spider/cookie.rb +9 -0
  19. data/app/models/rails_spider/failed_url.rb +7 -0
  20. data/app/models/rails_spider/local.rb +14 -0
  21. data/app/models/rails_spider/work.rb +24 -0
  22. data/app/views/layouts/the_spider/application.html.erb +14 -0
  23. data/app/views/the_spider/locals/_form.html.erb +17 -0
  24. data/app/views/the_spider/locals/edit.html.erb +6 -0
  25. data/app/views/the_spider/locals/index.html.erb +25 -0
  26. data/app/views/the_spider/locals/new.html.erb +5 -0
  27. data/app/views/the_spider/locals/show.html.erb +4 -0
  28. data/app/views/the_spider/works/_form.html.erb +9 -0
  29. data/app/views/the_spider/works/edit.html.erb +6 -0
  30. data/app/views/the_spider/works/index.html.erb +44 -0
  31. data/app/views/the_spider/works/new.html.erb +5 -0
  32. data/app/views/the_spider/works/show.html.erb +4 -0
  33. data/config/routes.rb +8 -0
  34. data/config/schedule.rb +35 -0
  35. data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
  36. data/lib/config/config.rb +27 -0
  37. data/lib/config/keywords.json +22 -0
  38. data/lib/config/proxy.json +10 -0
  39. data/lib/helper/helper.rb +6 -0
  40. data/lib/helper/location_helper.rb +46 -0
  41. data/lib/helper/price_helper.rb +23 -0
  42. data/lib/helper/tag_helper.rb +17 -0
  43. data/lib/helper/text_helper.rb +41 -0
  44. data/lib/helper/time_helper.rb +140 -0
  45. data/lib/logger.rb +146 -0
  46. data/lib/proxy/allproxylists.txt +2366 -0
  47. data/lib/proxy/proxy.rb +216 -0
  48. data/lib/proxy/proxylists.txt +625 -0
  49. data/lib/rails_spider.rb +10 -0
  50. data/lib/rails_spider/engine.rb +9 -0
  51. data/lib/rails_spider/fetchers.rb +2 -0
  52. data/lib/rails_spider/fetchers/base.rb +146 -0
  53. data/lib/rails_spider/fetchers/mechanize.rb +83 -0
  54. data/lib/rails_spider/fetchers/witar.rb +73 -0
  55. data/lib/rails_spider/parser.rb +14 -0
  56. data/lib/rails_spider/parser/szlawyers.rb +26 -0
  57. data/lib/rails_spider/resource.rb +58 -0
  58. data/lib/rails_spider/strategies.rb +6 -0
  59. data/lib/rails_spider/version.rb +3 -0
  60. data/lib/sync_qiniu.rb +35 -0
  61. data/lib/sync_qiniu/getimages.rb +98 -0
  62. data/lib/sync_qiniu/getimages_info.rb +37 -0
  63. data/lib/sync_qiniu/getlocation.rb +48 -0
  64. data/lib/sync_qiniu/getproxy.rb +95 -0
  65. data/lib/tasks/the_spider_tasks.rake +4 -0
  66. data/rakefile +284 -0
  67. metadata +165 -0
@@ -0,0 +1,14 @@
1
module RailsSpider
  # A stored page that belongs to a Work.
  class Local < ApplicationRecord
    belongs_to :work

    # Returns the parser for this record, building it on first use by
    # instantiating the owning work's parser class with this record's body.
    # The instance is cached for the lifetime of this object.
    def parser
      return @parser if @parser

      @parser = work.parser.new(body)
    end

    # Placeholder hook invoked by Work#parse; currently does nothing.
    def run; end
  end
end
@@ -0,0 +1,24 @@
1
module RailsSpider
  # A crawl definition that owns a collection of Local records.
  class Work < ApplicationRecord
    has_many :locals

    # Memoized Resource wrapper built around this work.
    # NOTE(review): Resource is defined outside this file; its contract is
    # not visible here.
    def resource
      @resource ||= Resource.new(self)
    end

    # Runs the resource for this work.
    #
    # Goes through the memoizing reader rather than the raw instance
    # variable, so this works even when #resource has not been called yet
    # (previously that case raised NoMethodError on nil).
    def run
      resource.run
    end

    # Resolves parser_name to a constant.
    #
    # @return [Class, nil] the constant named by parser_name, or nil when no
    #   such constant exists (safe_constantize does not raise)
    def parser
      @parser ||= parser_name.to_s.safe_constantize
    end

    # Invokes #run on every associated local.
    def parse
      locals.each(&:run)
    end
  end
end
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>The spider</title>
5
+ <%= stylesheet_link_tag "rails_spider/application", media: "all" %>
6
+ <%= javascript_include_tag "rails_spider/application" %>
7
+ <%= csrf_meta_tags %>
8
+ </head>
9
+ <body>
10
+
11
+ <%= yield %>
12
+
13
+ </body>
14
+ </html>
@@ -0,0 +1,17 @@
1
+ <%= form_with(model: local, local: true) do |form| %>
2
+ <% if local.errors.any? %>
3
+ <div id="error_explanation">
4
+ <h2><%= pluralize(local.errors.count, "error") %> prohibited this local from being saved:</h2>
5
+
6
+ <ul>
7
+ <% local.errors.full_messages.each do |message| %>
8
+ <li><%= message %></li>
9
+ <% end %>
10
+ </ul>
11
+ </div>
12
+ <% end %>
13
+
14
+ <div class="actions">
15
+ <%= form.submit %>
16
+ </div>
17
+ <% end %>
@@ -0,0 +1,6 @@
1
+ <h1>Editing Local</h1>
2
+
3
+ <%= render 'form', local: @local %>
4
+
5
+ <%= link_to 'Show', @local %> |
6
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,25 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <h1>Locals</h1>
4
+
5
+ <table>
6
+ <thead>
7
+ <tr>
8
+ <th colspan="3"></th>
9
+ </tr>
10
+ </thead>
11
+
12
+ <tbody>
13
+ <% @locals.each do |local| %>
14
+ <tr>
15
+ <td><%= link_to 'Show', local %></td>
16
+ <td><%= link_to 'Edit', edit_local_path(local) %></td>
17
+ <td><%= link_to 'Destroy', local, method: :delete, data: { confirm: 'Are you sure?' } %></td>
18
+ </tr>
19
+ <% end %>
20
+ </tbody>
21
+ </table>
22
+
23
+ <br>
24
+
25
+ <%= link_to 'New Local', new_local_path %>
@@ -0,0 +1,5 @@
1
+ <h1>New Local</h1>
2
+
3
+ <%= render 'form', local: @local %>
4
+
5
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,4 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <%= link_to 'Edit', edit_local_path(@local) %> |
4
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,9 @@
1
+ <%= default_form_with(model: work, local: true) do |f| %>
2
+ <%= f.text_field :name %>
3
+ <%= f.text_field :parser_name %>
4
+ <%= f.text_field :host %>
5
+ <%= f.text_field :list_path %>
6
+ <%= f.text_field :item_path %>
7
+ <%= f.text_field :page_params %>
8
+ <%= f.submit %>
9
+ <% end %>
@@ -0,0 +1,6 @@
1
+ <h1>Editing Work</h1>
2
+
3
+ <%= render 'form', work: @work %>
4
+
5
+ <%= link_to 'Show', @work %> |
6
+ <%= link_to 'Back', works_path %>
@@ -0,0 +1,44 @@
1
+ <div>
2
+
3
+ <div class="ui top attached menu borderless">
4
+ <div class="item"><strong>Works</strong></div>
5
+ </div>
6
+
7
+ <div class="ui segment top attached">
8
+ <%= link_to 'Add New', new_work_path, class: 'ui teal button' %>
9
+ </div>
10
+
11
+ <table class="ui bottom attached table">
12
+ <thead>
13
+ <tr>
14
+ <th>ID</th>
15
+ <th>Name</th>
16
+ <th>Host</th>
17
+ <th>List</th>
18
+ <th>Item</th>
19
+ <th>PageParams</th>
20
+ <th>Disable</th>
21
+ <th></th>
22
+ <th></th>
23
+ </tr>
24
+ </thead>
25
+
26
+ <tbody>
27
+ <% @works.each do |work| %>
28
+ <tr>
29
+ <td><%= work.id %></td>
30
+ <td><%= work.name %></td>
31
+ <td><%= work.host %></td>
32
+ <td><%= work.list_path %></td>
33
+ <td><%= work.item_path %></td>
34
+ <td><%= work.page_params %></td>
35
+ <td><%= link_to '运行', run_work_path(work), method: :patch, remote: true, data: { confirm: 'Are you sure?' }, class: 'ui blue label' %></td>
36
+ <td><%= link_to 'Edit', edit_work_path(work) %></td>
37
+ <td><%= link_to 'Destroy', work, method: :delete, data: { confirm: 'Are you sure?' } %></td>
38
+ </tr>
39
+ <% end %>
40
+ </tbody>
41
+ </table>
42
+
43
+ <%= paginate @works %>
44
+ </div>
@@ -0,0 +1,5 @@
1
+ <h1>New Work</h1>
2
+
3
+ <%= render 'form', work: @work %>
4
+
5
+ <%= link_to 'Back', works_path %>
@@ -0,0 +1,4 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <%= link_to 'Edit', edit_work_path(@work) %> |
4
+ <%= link_to 'Back', works_path %>
data/config/routes.rb ADDED
@@ -0,0 +1,8 @@
1
Rails.application.routes.draw do
  # Standard RESTful routes for locals.
  resources :locals

  # Standard RESTful routes for works, plus a member route
  # PATCH /works/:id/run (helper: run_work_path).
  resources :works do
    member do
      patch :run
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ # Use this file to easily define all of your cron jobs.
3
+ #
4
+ # It's helpful, but not entirely necessary to understand cron before proceeding.
5
+ # http://en.wikipedia.org/wiki/Cron
6
+
7
+ # Example:
8
+ #
9
+ # set :output, "/path/to/my/cron_log.log"
10
+ #
11
+ # every 2.hours do
12
+ # command "/usr/bin/some_great_command"
13
+ # runner "MyModel.some_method"
14
+ # rake "some:great:rake:task"
15
+ # end
16
+ #
17
+ # every 4.days do
18
+ # runner "AnotherModel.prune_old_records"
19
+ # end
20
+
21
+ # Learn more: http://github.com/javan/whenever
22
# Write the output of every scheduled job to rake.log under the directory
# this schedule is evaluated from.
set :output, "#{Dir.pwd}/rake.log"
#env :PATH, ENV['PATH']

#job_type :rake, "rake everyday:update_and_sync"
#set :environment_variable, :SPIDER_ENV
#set :environment, :development

# Every evening: run the everyday:grab_update rake task.
every :day, at: '9:30pm' do
  command "cd #{Dir.pwd} && rake everyday:grab_update"
end

# Every morning: run the everyday:event_image rake task.
every :day, at: '8:30am' do
  command "cd #{Dir.pwd} && rake everyday:event_image"
end
@@ -0,0 +1,38 @@
1
# Creates the four tables used by the spider: locals, cookies, failed URLs
# and works. Adjacent columns of the same type are declared together; the
# column order within each table is unchanged.
class RailsSpiderInit < ActiveRecord::Migration[5.0]
  def change
    # Stored pages, each pointing at a work.
    create_table :rails_spider_locals do |t|
      t.references :work
      t.string :url
      t.text :body, :draft
      t.timestamps
    end

    create_table :rails_spider_cookies do |t|
      t.string :name, :domain, :password, :value
      t.timestamps
    end

    create_table :rails_spider_failed_urls do |t|
      t.string :url, :source, :flat
      t.timestamps
    end

    # Crawl definitions; parser_name is declared on its own because it
    # carries a length limit.
    create_table :rails_spider_works do |t|
      t.string :name
      t.string :parser_name, limit: 50
      t.string :host, :list_path, :item_path, :page_params
      t.timestamps
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'json'
2
module EventSpider
  # Settings object exposing the `cities` and `event_class` accessors.
  class Configuration
    include ActiveSupport::Configurable
    config_accessor :cities, :event_class
  end

  class << self
    # The Configuration instance created by the first call to .configure,
    # or nil if .configure has never been called.
    attr_reader :config

    # Yields the shared Configuration instance (creating it on first use)
    # and returns whatever the block returns.
    def configure
      @config ||= EventSpider::Configuration.new
      yield @config
    end
  end

  # Defaults applied when this file is loaded.
  configure do |settings|
    settings.cities = ['上海', '北京', '深圳']
    settings.event_class = 'Event'
  end

  #config_path = File.expand_path('../config', __FILE__)
  #PROXY = JSON.load("#{config_path}/proxy.json")
end
@@ -0,0 +1,22 @@
1
+ {
2
+ "keywords": [
3
+ "医院",
4
+ "性病",
5
+ "红斑痣",
6
+ "阳痿",
7
+ "早泄",
8
+ "腰椎间盘突出",
9
+ "疙瘩",
10
+ "尿酸",
11
+ "胎记",
12
+ "肛门",
13
+ "脓肿",
14
+ "遗尿症",
15
+ "血管瘤",
16
+ "尿道炎",
17
+ "前列腺",
18
+ "美白",
19
+ "眼袋",
20
+ "瘦脸"
21
+ ]
22
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "proxy": [
3
+ {"ip": "107.20.237.127", "port": "80"},
4
+ {"ip": "202.171.253.98", "port": "80"},
5
+ {"ip": "85.185.149.31", "port": "80"},
6
+ {"ip": "115.29.164.39", "port": "80"}
7
+ ]
8
+ }
9
+
10
+
@@ -0,0 +1,6 @@
1
+
2
# Loads every helper module that sits next to this file.
# require_relative resolves against this file's own directory (lib/helper),
# so the siblings are named without a leading "helper/" segment; with that
# segment the lookup would target lib/helper/helper/*.rb.
require_relative 'location_helper'
require_relative 'price_helper'
require_relative 'text_helper'
require_relative 'time_helper'
require_relative 'tag_helper'
@@ -0,0 +1,46 @@
1
+ require 'net/http'
2
+
3
module LocationHelper

  # Baidu geocoding endpoint.
  # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
  BAIDU_GEOCODER_URL = 'http://api.map.baidu.com/geocoder/v2/'.freeze

  # NOTE(review): the access key is hard-coded in source; consider moving it
  # into configuration.
  BAIDU_GEOCODER_AK = 'A38d59da730152d77b407446a3c0dd2b'.freeze

  # Number of times a SocketError is retried before giving up.
  GEOCODE_MAX_RETRIES = 6

  # Geocodes an address string through the Baidu geocoding API.
  #
  # @param str [String, nil] the address to look up; nil is sent as an
  #   empty address
  # @return [Array] [lng, lat] taken from the API response on success,
  #   [0.0, 0.0] when the API reports a non-zero status, and
  #   [0.1, 0.1] when the request still fails with SocketError after all
  #   retries
  #
  # Errors other than SocketError (timeouts, malformed JSON, ...) are not
  # rescued here and propagate to the caller.
  def transform(str)
    retries = 0
    begin
      # Encode each query value on its own so characters such as "&", "="
      # or "#" inside the address cannot corrupt the query string.
      # encode_www_form writes a space as "+"; every literal "+" is already
      # emitted as %2B, so rewriting "+" to %20 is safe and keeps spaces
      # percent-encoded.
      query = URI.encode_www_form(address: str.to_s, output: 'json', ak: BAIDU_GEOCODER_AK)
                 .gsub('+', '%20')
      response = Net::HTTP.get_response(URI("#{BAIDU_GEOCODER_URL}?#{query}"))
      result = JSON.parse(response.body)

      # Status codes are described at:
      # http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
      return [0.0, 0.0] unless result['status'] == 0

      # Example payload:
      # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
      point = result['result']['location']
      [point['lng'], point['lat']]
    rescue SocketError
      return [0.1, 0.1] if retries >= GEOCODE_MAX_RETRIES

      retries += 1
      retry
    end
  end

  #def deal_location(location_text)
  #  if location_text.text != ""
  #    latitude = location_text.first.attr("content").to_f
  #    longitude = location_text.last.attr("content").to_f
  #    [longitude, latitude]
  #  else
  #    Location.transform(place)
  #  end
  #end

end
@@ -0,0 +1,23 @@
1
module PriceHelper

  # Extracts the fee values mentioned in a free-form price description.
  #
  # example: "60 - 940元" => ["60", "940"]
  #
  # Every run of digits is collected; "免费" or "FREE" contributes "0" and
  # "未知" contributes "未知". Duplicates are removed and the values are
  # ordered by their integer value.
  #
  # @param str [String, nil] price description; nil is treated as ""
  # @return [Array<String>] distinct fee tokens in ascending integer order
  def deal_fee(str)
    text = str.to_s
    fees = text.scan(/\d+/)
    fees << "0" if text.include?("免费") || text.include?("FREE")
    fees << "未知" if text.include?("未知")

    # "未知" converts to 0 and therefore ties with "0"; sort_by alone leaves
    # the order of ties unspecified, so the original position is used as a
    # tie-breaker to keep the result deterministic.
    fees.uniq.each_with_index.sort_by { |fee, position| [fee.to_i, position] }.map(&:first)
  end

end
@@ -0,0 +1,17 @@
1
module TagHelper

  # Category keywords recognised by #deal_kind.
  KINDS = /音乐|戏剧|讲座|聚会|电影|展览|运动|公益|旅行|派对/

  # Returns the first category keyword found in the text.
  #
  # Uses plain String#[] so the method does not rely on ActiveSupport's
  # `blank?` being loaded.
  #
  # @param str [String] text to search
  # @return [String] the first substring matching KINDS, or '其他' when
  #   nothing matches
  def deal_kind(str)
    str[KINDS] || '其他'
  end

  # Returns everything after the first "-" in the text.
  #
  # @param str [String] text such as "音乐-摇滚"
  # @return [String, nil] the part following the first "-" (possibly ""),
  #   or nil when the text contains no "-"
  def deal_subkind(str)
    _head, separator, rest = str.partition('-')
    rest unless separator.empty?
  end

end