rails_spider 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +165 -0
  3. data/README.md +33 -0
  4. data/app/assets/config/the_spider_manifest.js +4 -0
  5. data/app/assets/javascripts/the_spider/application.js +1 -0
  6. data/app/assets/stylesheets/the_spider/application.css +4 -0
  7. data/app/controllers/the_spider/application_controller.rb +9 -0
  8. data/app/controllers/the_spider/locals_controller.rb +62 -0
  9. data/app/controllers/the_spider/works_controller.rb +60 -0
  10. data/app/helpers/the_spider/application_helper.rb +4 -0
  11. data/app/helpers/the_spider/locals_helper.rb +4 -0
  12. data/app/helpers/the_spider/works_helper.rb +4 -0
  13. data/app/jobs/the_spider/application_job.rb +4 -0
  14. data/app/jobs/the_spider/parser_job.rb +11 -0
  15. data/app/jobs/the_spider/work_job.rb +11 -0
  16. data/app/mailers/the_spider/application_mailer.rb +6 -0
  17. data/app/models/rails_spider/application_record.rb +5 -0
  18. data/app/models/rails_spider/cookie.rb +9 -0
  19. data/app/models/rails_spider/failed_url.rb +7 -0
  20. data/app/models/rails_spider/local.rb +14 -0
  21. data/app/models/rails_spider/work.rb +24 -0
  22. data/app/views/layouts/the_spider/application.html.erb +14 -0
  23. data/app/views/the_spider/locals/_form.html.erb +17 -0
  24. data/app/views/the_spider/locals/edit.html.erb +6 -0
  25. data/app/views/the_spider/locals/index.html.erb +25 -0
  26. data/app/views/the_spider/locals/new.html.erb +5 -0
  27. data/app/views/the_spider/locals/show.html.erb +4 -0
  28. data/app/views/the_spider/works/_form.html.erb +9 -0
  29. data/app/views/the_spider/works/edit.html.erb +6 -0
  30. data/app/views/the_spider/works/index.html.erb +44 -0
  31. data/app/views/the_spider/works/new.html.erb +5 -0
  32. data/app/views/the_spider/works/show.html.erb +4 -0
  33. data/config/routes.rb +8 -0
  34. data/config/schedule.rb +35 -0
  35. data/db/migrate/20170502153051_rails_spider_init.rb +38 -0
  36. data/lib/config/config.rb +27 -0
  37. data/lib/config/keywords.json +22 -0
  38. data/lib/config/proxy.json +10 -0
  39. data/lib/helper/helper.rb +6 -0
  40. data/lib/helper/location_helper.rb +46 -0
  41. data/lib/helper/price_helper.rb +23 -0
  42. data/lib/helper/tag_helper.rb +17 -0
  43. data/lib/helper/text_helper.rb +41 -0
  44. data/lib/helper/time_helper.rb +140 -0
  45. data/lib/logger.rb +146 -0
  46. data/lib/proxy/allproxylists.txt +2366 -0
  47. data/lib/proxy/proxy.rb +216 -0
  48. data/lib/proxy/proxylists.txt +625 -0
  49. data/lib/rails_spider.rb +10 -0
  50. data/lib/rails_spider/engine.rb +9 -0
  51. data/lib/rails_spider/fetchers.rb +2 -0
  52. data/lib/rails_spider/fetchers/base.rb +146 -0
  53. data/lib/rails_spider/fetchers/mechanize.rb +83 -0
  54. data/lib/rails_spider/fetchers/witar.rb +73 -0
  55. data/lib/rails_spider/parser.rb +14 -0
  56. data/lib/rails_spider/parser/szlawyers.rb +26 -0
  57. data/lib/rails_spider/resource.rb +58 -0
  58. data/lib/rails_spider/strategies.rb +6 -0
  59. data/lib/rails_spider/version.rb +3 -0
  60. data/lib/sync_qiniu.rb +35 -0
  61. data/lib/sync_qiniu/getimages.rb +98 -0
  62. data/lib/sync_qiniu/getimages_info.rb +37 -0
  63. data/lib/sync_qiniu/getlocation.rb +48 -0
  64. data/lib/sync_qiniu/getproxy.rb +95 -0
  65. data/lib/tasks/the_spider_tasks.rake +4 -0
  66. data/rakefile +284 -0
  67. metadata +165 -0
@@ -0,0 +1,14 @@
1
module RailsSpider
  # A stored page record that belongs to a Work.
  class Local < ApplicationRecord
    belongs_to :work

    # Returns the parser for this record, built once and then reused.
    #
    # The parser class comes from the owning work (+work.parser+) and is
    # instantiated with this record's +body+.
    def parser
      return @parser if @parser

      @parser = work.parser.new(body)
    end

    # Placeholder: intentionally has no behaviour yet.
    def run
    end
  end
end
@@ -0,0 +1,24 @@
1
module RailsSpider
  # A crawl definition; owns the Local records gathered for it.
  class Work < ApplicationRecord
    has_many :locals

    # Returns the Resource wrapping this work, built once and then reused.
    # NOTE(review): Resource is defined outside this file; its contract is
    # not visible here.
    def resource
      @resource ||= Resource.new(self)
    end

    # Runs the work's resource.
    #
    # Goes through the memoizing +resource+ accessor rather than reading
    # +@resource+ directly: the instance variable is nil until +resource+
    # has been called, so the direct read raised NoMethodError on a freshly
    # loaded record.
    def run
      resource.run
    end

    # Resolves +parser_name+ to a constant, memoized.
    # Returns nil when no such constant exists (safe_constantize).
    def parser
      @parser ||= parser_name.to_s.safe_constantize
    end

    # Invokes +run+ on every associated local.
    def parse
      locals.each(&:run)
    end
  end
end
@@ -0,0 +1,14 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>The spider</title>
5
+ <%= stylesheet_link_tag "rails_spider/application", media: "all" %>
6
+ <%= javascript_include_tag "rails_spider/application" %>
7
+ <%= csrf_meta_tags %>
8
+ </head>
9
+ <body>
10
+
11
+ <%= yield %>
12
+
13
+ </body>
14
+ </html>
@@ -0,0 +1,17 @@
1
+ <%= form_with(model: local, local: true) do |form| %>
2
+ <% if local.errors.any? %>
3
+ <div id="error_explanation">
4
+ <h2><%= pluralize(local.errors.count, "error") %> prohibited this local from being saved:</h2>
5
+
6
+ <ul>
7
+ <% local.errors.full_messages.each do |message| %>
8
+ <li><%= message %></li>
9
+ <% end %>
10
+ </ul>
11
+ </div>
12
+ <% end %>
13
+
14
+ <div class="actions">
15
+ <%= form.submit %>
16
+ </div>
17
+ <% end %>
@@ -0,0 +1,6 @@
1
+ <h1>Editing Local</h1>
2
+
3
+ <%= render 'form', local: @local %>
4
+
5
+ <%= link_to 'Show', @local %> |
6
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,25 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <h1>Locals</h1>
4
+
5
+ <table>
6
+ <thead>
7
+ <tr>
8
+ <th colspan="3"></th>
9
+ </tr>
10
+ </thead>
11
+
12
+ <tbody>
13
+ <% @locals.each do |local| %>
14
+ <tr>
15
+ <td><%= link_to 'Show', local %></td>
16
+ <td><%= link_to 'Edit', edit_local_path(local) %></td>
17
+ <td><%= link_to 'Destroy', local, method: :delete, data: { confirm: 'Are you sure?' } %></td>
18
+ </tr>
19
+ <% end %>
20
+ </tbody>
21
+ </table>
22
+
23
+ <br>
24
+
25
+ <%= link_to 'New Local', new_local_path %>
@@ -0,0 +1,5 @@
1
+ <h1>New Local</h1>
2
+
3
+ <%= render 'form', local: @local %>
4
+
5
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,4 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <%= link_to 'Edit', edit_local_path(@local) %> |
4
+ <%= link_to 'Back', locals_path %>
@@ -0,0 +1,9 @@
1
+ <%= default_form_with(model: work, local: true) do |f| %>
2
+ <%= f.text_field :name %>
3
+ <%= f.text_field :parser_name %>
4
+ <%= f.text_field :host %>
5
+ <%= f.text_field :list_path %>
6
+ <%= f.text_field :item_path %>
7
+ <%= f.text_field :page_params %>
8
+ <%= f.submit %>
9
+ <% end %>
@@ -0,0 +1,6 @@
1
+ <h1>Editing Work</h1>
2
+
3
+ <%= render 'form', work: @work %>
4
+
5
+ <%= link_to 'Show', @work %> |
6
+ <%= link_to 'Back', works_path %>
@@ -0,0 +1,44 @@
1
+ <div>
2
+
3
+ <div class="ui top attached menu borderless">
4
+ <div class="item"><strong>Users</strong></div>
5
+ </div>
6
+
7
+ <div class="ui segment top attached">
8
+ <%= link_to 'Add New', new_work_path, class: 'ui teal button' %>
9
+ </div>
10
+
11
+ <table class="ui bottom attached table">
12
+ <thead>
13
+ <tr>
14
+ <th>ID</th>
15
+ <th>Name</th>
16
+ <th>Host</th>
17
+ <th>List</th>
18
+ <th>Item</th>
19
+ <th>PageParams</th>
20
+ <th>Disable</th>
21
+ <th></th>
22
+ <th></th>
23
+ </tr>
24
+ </thead>
25
+
26
+ <tbody>
27
+ <% @works.each do |work| %>
28
+ <tr>
29
+ <td><%= work.id %></td>
30
+ <td><%= work.name %></td>
31
+ <td><%= work.host %></td>
32
+ <td><%= work.list_path %></td>
33
+ <td><%= work.item_path %></td>
34
+ <td><%= work.page_params %></td>
35
+ <td><%= link_to '运行', run_work_path(work), method: :patch, remote: true, data: { confirm: 'Are you sure?' }, class: 'ui blue label' %></td>
36
+ <td><%= link_to 'Edit', edit_work_path(work) %></td>
37
+ <td><%= link_to 'Destroy', work, method: :delete, data: { confirm: 'Are you sure?' } %></td>
38
+ </tr>
39
+ <% end %>
40
+ </tbody>
41
+ </table>
42
+
43
+ <%= paginate @works %>
44
+ </div>
@@ -0,0 +1,5 @@
1
+ <h1>New Work</h1>
2
+
3
+ <%= render 'form', work: @work %>
4
+
5
+ <%= link_to 'Back', works_path %>
@@ -0,0 +1,4 @@
1
+ <p id="notice"><%= notice %></p>
2
+
3
+ <%= link_to 'Edit', edit_work_path(@work) %> |
4
+ <%= link_to 'Back', works_path %>
data/config/routes.rb ADDED
@@ -0,0 +1,8 @@
1
Rails.application.routes.draw do
  # Standard CRUD routes for locals.
  resources :locals

  # CRUD routes for works, plus PATCH /works/:id/run on a single work.
  resources :works do
    member do
      patch :run
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # coding: utf-8
2
+ # Use this file to easily define all of your cron jobs.
3
+ #
4
+ # It's helpful, but not entirely necessary to understand cron before proceeding.
5
+ # http://en.wikipedia.org/wiki/Cron
6
+
7
+ # Example:
8
+ #
9
+ # set :output, "/path/to/my/cron_log.log"
10
+ #
11
+ # every 2.hours do
12
+ # command "/usr/bin/some_great_command"
13
+ # runner "MyModel.some_method"
14
+ # rake "some:great:rake:task"
15
+ # end
16
+ #
17
+ # every 4.days do
18
+ # runner "AnotherModel.prune_old_records"
19
+ # end
20
+
21
+ # Learn more: http://github.com/javan/whenever
22
# Directory this schedule is evaluated from; used for the cron log file and
# as the working directory of every scheduled command.
project_dir = Dir.pwd

set :output, "#{project_dir}/rake.log"
#env :PATH, ENV['PATH']

#job_type :rake, "rake everyday:update_and_sync"
#set :environment_variable, :SPIDER_ENV
#set :environment, :development

# Every evening: run the everyday:grab_update rake task.
every :day, at: '9:30pm' do
  command "cd #{project_dir} && rake everyday:grab_update"
end

# Every morning: run the everyday:event_image rake task.
every :day, at: '8:30am' do
  command "cd #{project_dir} && rake everyday:event_image"
end
@@ -0,0 +1,38 @@
1
# Creates the four rails_spider tables: locals, cookies, failed_urls, works.
class RailsSpiderInit < ActiveRecord::Migration[5.0]
  def change
    # Stored pages, each referencing a work.
    create_table :rails_spider_locals do |t|
      t.references :work
      t.string :url
      t.text :body, :draft
      t.timestamps
    end

    create_table :rails_spider_cookies do |t|
      t.string :name, :domain, :password, :value
      t.timestamps
    end

    create_table :rails_spider_failed_urls do |t|
      t.string :url, :source, :flat
      t.timestamps
    end

    # Work definitions; parser_name is capped at 50 characters.
    create_table :rails_spider_works do |t|
      t.string :name
      t.string :parser_name, limit: 50
      t.string :host, :list_path, :item_path, :page_params
      t.timestamps
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'json'
2
module EventSpider
  # Settings object exposing the +cities+ and +event_class+ accessors.
  class Configuration
    include ActiveSupport::Configurable
    config_accessor :cities, :event_class
  end

  class << self
    # The current Configuration, or nil if +configure+ has never been called.
    attr_reader :config

    # Yields the shared Configuration, creating it on first use.
    def configure
      @config ||= EventSpider::Configuration.new
      yield @config
    end
  end

  # Defaults applied when this file is loaded.
  configure do |settings|
    settings.cities = ['上海', '北京', '深圳']
    settings.event_class = 'Event'
  end

  #config_path = File.expand_path('../config', __FILE__)
  #PROXY = JSON.load("#{config_path}/proxy.json")
end
@@ -0,0 +1,22 @@
1
+ {
2
+ "keywords": [
3
+ "医院",
4
+ "性病",
5
+ "红斑痣",
6
+ "阳痿",
7
+ "早泄",
8
+ "腰椎间盘突出",
9
+ "疙瘩",
10
+ "尿酸",
11
+ "胎记",
12
+ "肛门",
13
+ "脓肿",
14
+ "遗尿症",
15
+ "血管瘤",
16
+ "尿道炎",
17
+ "前列腺",
18
+ "美白",
19
+ "眼袋",
20
+ "瘦脸"
21
+ ]
22
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ proxy: [
3
+ {ip: "107.20.237.127", port: "80"},
4
+ {ip: "202.171.253.98", port: "80"},
5
+ {ip: "85.185.149.31", port: "80"},
6
+ {ip: "115.29.164.39", port: "80"}
7
+ ]
8
+ }
9
+
10
+
@@ -0,0 +1,6 @@
1
+
2
# Loads every helper module. require_relative resolves against this file's
# own directory (lib/helper), so the sibling helper files are named without
# a "helper/" prefix.
require_relative 'location_helper'
require_relative 'price_helper'
require_relative 'text_helper'
require_relative 'time_helper'
require_relative 'tag_helper'
@@ -0,0 +1,46 @@
1
+ require 'net/http'
2
+
3
module LocationHelper

  # Geocodes an address string through the Baidu geocoding web service.
  # Geocoding API: http://developer.baidu.com/map/webservice-geocoding.htm
  #
  # @param str [String] the address to look up
  # @return [Array] [lng, lat] taken from the response on success;
  #   [0.0, 0.0] when the service answers with a non-zero status;
  #   [0.1, 0.1] when the lookup keeps failing with SocketError or the
  #   response body is not valid JSON
  def transform(str)
    # Build the query with encode_www_form: URI.escape no longer exists in
    # Ruby 3.0+, and it never escaped "&" or "=" inside the address.
    uri = URI('http://api.map.baidu.com/geocoder/v2/')
    uri.query = URI.encode_www_form(
      address: str,
      output: 'json',
      ak: 'A38d59da730152d77b407446a3c0dd2b'
    )

    max_retries = 6
    retries = 0
    begin
      response = Net::HTTP.get_response(uri)
      result = JSON.parse(response.body)

      # Status code reference:
      # http://developer.baidu.com/map/webservice-geocoding.htm#.E6.8E.A5.E5.8F.A3.E7.A4.BA.E4.BE.8A
      return [0.0, 0.0] unless result["status"] == 0

      # Example payload:
      # {"status"=>0, "result"=>{"location"=>{"lng"=>121.48026424818, "lat"=>31.229092805768}, "precise"=>1, "confidence"=>80, "level"=>"道路"}}
      point = result["result"]["location"]
      [point["lng"], point["lat"]]
    rescue SocketError
      # Bounded retry on socket failures, then fall back.
      retries += 1
      retry if retries <= max_retries
      [0.1, 0.1]
    rescue JSON::ParserError
      # The body was not JSON (e.g. an error page); use the same fallback.
      [0.1, 0.1]
    end
  end

  #def deal_location(location_text)
  #  if location_text.text != ""
  #    latitude = location_text.first.attr("content").to_f
  #    longitude = location_text.last.attr("content").to_f
  #    [longitude, latitude]
  #  else
  #    Location.transform(place)
  #  end
  #end

end
@@ -0,0 +1,23 @@
1
module PriceHelper

  # Extracts the fee tokens from a price description.
  # example: 60 - 940元
  #
  # Every run of digits is collected; "0" is added when the text contains
  # "免费" or "FREE", and "未知" is added when the text contains "未知".
  # Duplicates are dropped and the tokens are ordered by integer value.
  #
  # @param str [String] raw price text
  # @return [Array<String>] unique fee tokens in ascending numeric order
  def deal_fee(str)
    amounts = str.scan(/\d+/)

    amounts.push("0") if str.include?("免费")
    amounts.push("0") if str.include?("FREE")
    amounts.push("未知") if str.include?("未知")

    amounts.uniq.sort_by(&:to_i)
  end

end
@@ -0,0 +1,17 @@
1
module TagHelper

  # Event categories recognised by deal_kind.
  KINDS = /音乐|戏剧|讲座|聚会|电影|展览|运动|公益|旅行|派对/

  # Returns the first known category found in +str+, or '其他' when none
  # matches. Uses plain String#[] so the helper needs no ActiveSupport.
  #
  # @param str [String] text to classify
  # @return [String] the matched category, or '其他'
  def deal_kind(str)
    str[KINDS] || '其他'
  end

  # Returns everything after the first "-" in +str+.
  #
  # @param str [String] text such as "kind-subkind"
  # @return [String, nil] the part after the first "-" (possibly empty),
  #   or nil when +str+ contains no "-"
  def deal_subkind(str)
    return unless str.include?("-")

    str.partition("-").last
  end

end