pba_crawler 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/app/controllers/proxies_controller.rb +39 -0
- data/app/models/proxy.rb +3 -0
- data/app/views/proxies/_form.html.erb +8 -0
- data/app/views/proxies/edit.html.erb +6 -0
- data/app/views/proxies/index.html.erb +24 -0
- data/app/views/proxies/new.html.erb +5 -0
- data/config/routes.rb +3 -0
- data/db/migrate/20110321122537_create_proxy.rb +14 -0
- data/lib/generators/crawler/install/install_generator.rb +20 -0
- data/lib/pba_crawler.rb +10 -0
- data/lib/pba_crawler/file_utilz.rb +73 -0
- data/lib/pba_crawler/generic.rb +78 -0
- data/lib/pba_crawler/version.rb +3 -0
- data/lib/pba_crawler_engine.rb +5 -0
- data/pba_crawler.gemspec +25 -0
- data/readme.textile +16 -0
- metadata +107 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# Standard RESTful CRUD controller for the Proxy model.
# Note: there is no +show+ action — the engine only lists, creates,
# edits and deletes proxy records.
class ProxiesController < ApplicationController
  # GET /proxies — list every stored proxy.
  def index
    @proxies = Proxy.all
  end

  # GET /proxies/new — build a blank form object.
  def new
    @proxy = Proxy.new
  end

  # GET /proxies/:id/edit — load the record to edit.
  def edit
    @proxy = Proxy.find(params[:id])
  end

  # POST /proxies — persist a new proxy, or re-render the form on
  # validation failure.
  def create
    @proxy = Proxy.new(params[:proxy])
    unless @proxy.save
      render :action => "new"
      return
    end
    redirect_to(proxies_url, :notice => 'Proxy was successfully created.')
  end

  # PUT /proxies/:id — apply the submitted attributes, or re-render the
  # edit form on validation failure.
  def update
    @proxy = Proxy.find(params[:id])
    unless @proxy.update_attributes(params[:proxy])
      render :action => "edit"
      return
    end
    redirect_to(proxies_url, :notice => 'Proxy was successfully updated.')
  end

  # DELETE /proxies/:id — remove the record, then return to the list.
  def destroy
    @proxy = Proxy.find(params[:id])
    @proxy.destroy
    redirect_to(proxies_url)
  end
end
|
data/app/models/proxy.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
<%# Lists all proxies with edit/destroy links. Rendered by ProxiesController#index. %>
<h1>Listing proxies</h1>

<table>
  <tr>
    <th>Address</th>
    <%# FIX: this column renders proxy.port, so label it "Port" (was mislabeled "IP") %>
    <th>Port</th>
    <th></th>
    <th></th>
    <th></th>
  </tr>

<% @proxies.each do |proxy| %>
  <tr>
    <td><%= proxy.address %></td>
    <td><%= proxy.port %></td>
    <%# NOTE(review): no `show` action or view ships with this engine — TODO confirm this link's target %>
    <td><%= link_to 'Show', proxy %></td>
    <td><%= link_to 'Edit', edit_proxy_path(proxy) %></td>
    <td><%= link_to 'Destroy', proxy, :confirm => 'Are you sure?', :method => :delete %></td>
  </tr>
<% end %>
</table>
<br />

<%= link_to 'New Proxy', new_proxy_path %>
|
data/config/routes.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Creates the +proxies+ table that backs the Proxy model: one row per
# HTTP proxy the crawler can rotate through.
class CreateProxy < ActiveRecord::Migration
  def self.up
    create_table :proxies do |t|
      t.string :address            # proxy host or IP address
      t.integer :port              # proxy TCP port
      t.datetime :last_request_at  # when this proxy last served a request
      t.integer :position          # ordering used by Proxy.order(:position)
      t.timestamps
    end
  end
  def self.down
    drop_table :proxies
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# encoding: utf-8
# FIX: the gem ships lib/pba_crawler/file_utilz.rb defining
# PbaCrawler::FileUtilz — there is no crawler/file_utilz.rb, so the
# previous `require "crawler/file_utilz.rb"` raised LoadError.
require "pba_crawler/file_utilz"

module Crawler
  # Copies database migration files
  #
  # @example
  #   $ rails generate crawler:install
  #
  # @todo Revisit in Rails 3.1 where public assets are treated differently
  class InstallGenerator < Rails::Generators::Base
    desc "Copies database migration files"

    # Mirrors the gem's bundled db/ directory (migrations) into the host
    # application's db/ directory.
    def copy_files
      source = File.join(File.dirname(__FILE__),'..','..', '..', '..', 'db')
      destination = File.join(Rails.root, 'db')
      # FIX: was Crawler::FileUtilz, a constant that is never defined.
      PbaCrawler::FileUtilz.mirror_files(source, destination)
    end
  end
end
|
data/lib/pba_crawler.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module PbaCrawler
  # File-copy helpers used by the install generator.
  class FileUtilz
    # A general purpose method to mirror a directory (+source+) into a destination
    # directory, including all files and subdirectories. Files will not be mirrored
    # if they are identical already (checked via contents comparison).
    #
    # Copyright (c) 2008 James Adam (The MIT License)

    # added: do mirroring with backup saving turned on — any overwritten
    # file is first renamed to "<name>~".
    def self.mirror_with_backup(source, destination)
      self.mirror_files(source, destination, true)
    end

    # Mirror +source+ (a directory or single file) into +destination+.
    # Prints a message and returns nil when +source+ does not exist.
    def self.mirror_files(source, destination, create_backups = false)
      # TODO: use Rake::FileList#pathmap?
      if File.directory?(source)
        source_files = Dir[source + "/**/*"]
        source_dirs = source_files.select { |d| File.directory?(d) }
        source_files -= source_dirs
      elsif File.exist? source
        source_dirs = []
        source_files = [source]
        source = File.dirname source
      else
        puts "Could not mirror #{source} - entity does not exist"
        return
      end

      unless source_files.empty?
        base_target_dir = File.join(destination)
        # if destination is not yet a directory, copy into its parent
        base_target_dir = File.dirname base_target_dir unless File.directory? base_target_dir
        FileUtils.mkdir_p(base_target_dir)
      end

      base_target_dir ||= File.join(destination)

      source_dirs.each do |dir|
        # strip down these paths so we have simple, relative paths we can
        # add to the destination
        target_dir = File.join(base_target_dir, dir.gsub(source, ''))
        begin
          FileUtils.mkdir_p(target_dir)
        rescue StandardError => e
          # FIX: was `rescue Exception` (swallows signals) and
          # `"..." + e`, which raises TypeError instead of the message.
          raise "Could not create directory #{target_dir}: \n#{e.message}"
        end
      end

      source_files.each do |file|
        begin
          target = File.join(base_target_dir, file.gsub(source, ''))
          unless File.exist?(target) && self.same_contents(file, target)
            # WAS FileUtils.identical?(file, target), but we want to test contents too
            if create_backups && File.exist?(target)
              File.rename(target, target + '~')
            end
            FileUtils.cp(file, target)
          end
        rescue StandardError => e
          # FIX: same String + Exception TypeError as above.
          raise "Could not copy #{file} to #{target}: \n#{e.message}"
        end
      end
    end

    # for windows users... compare raw file contents instead of relying
    # on FileUtils.identical?
    def self.same_contents(source,destination)
      File.read(source) == File.read(destination)
    end

  end
end
|
73
|
+
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module PbaCrawler
  # Generic crawler that fetches URLs through a rotating list of HTTP
  # proxies stored in the Proxy model: when a request through the
  # current proxy fails, the next proxy in the list is tried.
  class Generic

    # list of proxies address
    # NOTE(review): class variables are shared across all Generic
    # instances (and subclasses), so the proxy cursor persists between
    # instances — confirm this is intentional.
    @@proxies = ""
    @@cursor = 0
    @@proxy = nil

    def initialize
      # load proxy list, ordered by the `position` column
      @@proxies = Proxy.order(:position)
    end

    # Fetch +url+ with a fresh Mechanize agent routed through the
    # current proxy. Returns the Mechanize response, or nil when every
    # proxy has been exhausted.
    def crawl_url(url)
      agent = Mechanize.new { |a|
        a.html_parser = Nokogiri::HTML
        a.open_timeout = 20
        a.read_timeout = 20
      }
      @@proxy = @@proxies.fetch(@@cursor)
      agent.set_proxy(@@proxy.address,@@proxy.port)
      agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'

      get_url(agent,url)
    end

    # Perform the GET; on failure rotate to the next proxy and retry.
    def get_url(agent,url)
      begin
        response = agent.get(url)
        if response
          # record the successful use of this proxy
          @@proxy.update_attributes(:last_request_at => Time.now)
        else
          # FIX: the retried response was previously discarded and nil
          # returned; keep the value obtained through the next proxy.
          response = rescue_get_url(agent,url)
        end
      rescue
        # FIX: same as above — the rescue path lost the retried response.
        response = rescue_get_url(agent,url)
      end
      response
    end

    # Advance the cursor to the next proxy and retry; returns the
    # response, or nil when no proxies remain.
    def rescue_get_url(agent,url)
      if @@cursor + 1 < @@proxies.size
        p "#{agent.proxy_addr} #{agent.proxy_port}"
        @@cursor = @@cursor + 1
        @@proxy = @@proxies.fetch(@@cursor)
        agent.set_proxy(@@proxy.address,@@proxy.port)
        response = get_url(agent,url)
      end
    end

    # Replace the proxies table with a small hard-coded sample list
    # (development/demo seed data).
    def load_sample
      list = [
        ["101.0.6.87","31280"],
        ["109.120.130.2","3128"],
        ["62.215.5.69","8080"],
        ["62.215.5.66","8080"],
        ["212.33.199.19","80"],
        ["196.1.70.201","80"],
        ["194.87.32.233","80"],
        ["94.125.27.20","8080"],
        ["95.66.35.1","80"],
        ["124.164.247.43","3128"],
        ["186.225.98.17","3128"],
        ["122.155.13.135","3128"],
        ["148.81.130.19","80"],
        ["202.129.58.68","80"],
        ["97.100.143.20","27977"],
        ["24.12.13.154","27977"],
        ["200.129.180.61","80"]
      ]
      Proxy.delete_all
      list.each do |add|
        Proxy.create(:address => add[0], :port => add[1].to_i, :last_request_at => Time.now)
      end
    end

  end
end
|
data/pba_crawler.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for pba_crawler. File lists are derived from
# `git ls-files`, so the spec must be evaluated inside the git checkout.
$:.push File.expand_path("../lib", __FILE__)
require "pba_crawler/version"

Gem::Specification.new do |s|
  s.name        = "pba_crawler"
  s.version     = PbaCrawler::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Pierre BASILE"]
  s.email       = ["pierre.basile@gmail.com"]
  s.homepage    = ""
  s.summary     = %q{Crawl websites}
  s.description = %q{Crawl websites}

  s.rubyforge_project = "pba_crawler"

  # package everything tracked by git; executables come from bin/
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # runtime dependencies are pinned to exact versions
  s.add_dependency('formtastic', '1.2.3')
  s.add_dependency('nokogiri', '1.4.4')
  s.add_dependency('mechanize', '1.0.0')
end
|
data/readme.textile
ADDED
metadata
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pba_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.4
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Pierre BASILE
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-03-22 00:00:00 +01:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: formtastic
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - "="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.2.3
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
prerelease: false
|
30
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - "="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: 1.4.4
|
36
|
+
type: :runtime
|
37
|
+
version_requirements: *id002
|
38
|
+
- !ruby/object:Gem::Dependency
|
39
|
+
name: mechanize
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
42
|
+
none: false
|
43
|
+
requirements:
|
44
|
+
- - "="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.0.0
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id003
|
49
|
+
description: Crawl websites
|
50
|
+
email:
|
51
|
+
- pierre.basile@gmail.com
|
52
|
+
executables: []
|
53
|
+
|
54
|
+
extensions: []
|
55
|
+
|
56
|
+
extra_rdoc_files: []
|
57
|
+
|
58
|
+
files:
|
59
|
+
- .gitignore
|
60
|
+
- Gemfile
|
61
|
+
- Rakefile
|
62
|
+
- app/controllers/proxies_controller.rb
|
63
|
+
- app/models/proxy.rb
|
64
|
+
- app/views/proxies/_form.html.erb
|
65
|
+
- app/views/proxies/edit.html.erb
|
66
|
+
- app/views/proxies/index.html.erb
|
67
|
+
- app/views/proxies/new.html.erb
|
68
|
+
- config/routes.rb
|
69
|
+
- db/migrate/20110321122537_create_proxy.rb
|
70
|
+
- lib/generators/crawler/install/install_generator.rb
|
71
|
+
- lib/pba_crawler.rb
|
72
|
+
- lib/pba_crawler/file_utilz.rb
|
73
|
+
- lib/pba_crawler/generic.rb
|
74
|
+
- lib/pba_crawler/version.rb
|
75
|
+
- lib/pba_crawler_engine.rb
|
76
|
+
- pba_crawler.gemspec
|
77
|
+
- readme.textile
|
78
|
+
has_rdoc: true
|
79
|
+
homepage: ""
|
80
|
+
licenses: []
|
81
|
+
|
82
|
+
post_install_message:
|
83
|
+
rdoc_options: []
|
84
|
+
|
85
|
+
require_paths:
|
86
|
+
- lib
|
87
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: "0"
|
93
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: "0"
|
99
|
+
requirements: []
|
100
|
+
|
101
|
+
rubyforge_project: pba_crawler
|
102
|
+
rubygems_version: 1.6.2
|
103
|
+
signing_key:
|
104
|
+
specification_version: 3
|
105
|
+
summary: Crawl websites
|
106
|
+
test_files: []
|
107
|
+
|