pba_crawler 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in pba_crawler.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
# Standard Bundler gem tasks (rake build / install / release).
require 'bundler'
Bundler::GemHelper.install_tasks
@@ -0,0 +1,39 @@
1
# Standard CRUD controller for managing the crawler's proxy list.
# Relies on the RESTful routes declared in config/routes.rb
# (resources :proxies).
class ProxiesController < ApplicationController

  # GET /proxies -- list every configured proxy.
  def index
    @proxies = Proxy.all
  end

  # GET /proxies/:id
  # Added: the index and edit views both render link_to 'Show', proxy,
  # which routes here, but the action was missing (UnknownAction).
  # NOTE(review): a matching app/views/proxies/show.html.erb is still
  # required for this to render -- confirm it exists or add one.
  def show
    @proxy = Proxy.find(params[:id])
  end

  # GET /proxies/new -- blank form.
  def new
    @proxy = Proxy.new
  end

  # GET /proxies/:id/edit
  def edit
    @proxy = Proxy.find(params[:id])
  end

  # POST /proxies -- build from form params; re-render the form when
  # validation fails.
  def create
    @proxy = Proxy.new(params[:proxy])
    if @proxy.save
      redirect_to(proxies_url, :notice => 'Proxy was successfully created.')
    else
      render :action => "new"
    end
  end

  # PUT /proxies/:id
  def update
    @proxy = Proxy.find(params[:id])
    if @proxy.update_attributes(params[:proxy])
      redirect_to(proxies_url, :notice => 'Proxy was successfully updated.')
    else
      render :action => "edit"
    end
  end

  # DELETE /proxies/:id
  def destroy
    @proxy = Proxy.find(params[:id])
    @proxy.destroy

    redirect_to(proxies_url)
  end
end
@@ -0,0 +1,3 @@
1
# ActiveRecord model backing the `proxies` table (see the
# create_proxy migration): address, port, last_request_at, position.
# Rows are consumed in :position order by PbaCrawler::Generic.
class Proxy < ActiveRecord::Base

end
@@ -0,0 +1,8 @@
1
<%# Formtastic form partial shared by new.html.erb and edit.html.erb. %>
<%# NOTE(review): Rails 3 requires output-style ERB tags (equals form) for block helpers such as semantic_form_for and form.buttons; the plain evaluation tags used here only work on Rails 2.x -- confirm the Rails version this engine targets. %>
<% semantic_form_for @proxy do |form| %>
  <%= form.input :address %>
  <%= form.input :port %>
  <%= form.input :position %>
  <% form.buttons do %>
    <%= form.commit_button %>
  <% end %>
<% end %>
@@ -0,0 +1,6 @@
1
<%# Edit screen for an existing proxy; renders the shared Formtastic partial. %>
<h1>Editing Proxy</h1>

<%= render 'form' %>

<%# NOTE(review): the 'Show' link routes to proxies#show -- confirm the controller defines a show action and that a show view exists. %>
<%= link_to 'Show', @proxy %> |
<%= link_to 'Back', proxies_path %>
@@ -0,0 +1,24 @@
1
<%# Proxy listing with per-row Show / Edit / Destroy actions. %>
<h1>Listing proxies</h1>

<table>
  <tr>
    <th>Address</th>
    <%# Fixed: this column renders proxy.port, but the header said "IP". %>
    <th>Port</th>
    <th></th>
    <th></th>
    <th></th>
  </tr>

  <% @proxies.each do |proxy| %>
    <tr>
      <td><%= proxy.address %></td>
      <td><%= proxy.port %></td>
      <td><%= link_to 'Show', proxy %></td>
      <td><%= link_to 'Edit', edit_proxy_path(proxy) %></td>
      <td><%= link_to 'Destroy', proxy, :confirm => 'Are you sure?', :method => :delete %></td>
    </tr>
  <% end %>
</table>
<br />

<%= link_to 'New Proxy', new_proxy_path %>
@@ -0,0 +1,5 @@
1
<%# Creation screen for a proxy; renders the shared Formtastic partial. %>
<h1>New Proxy</h1>

<%= render 'form' %>

<%= link_to 'Back', proxies_path %>
data/config/routes.rb ADDED
@@ -0,0 +1,3 @@
1
# Engine routes: full RESTful routing for the proxy admin screens.
# Fixed: ActionController::Routing::Routes is the removed Rails 2 API
# (whose draw block also took a |map| argument, which this code did not
# use), while the rest of this gem (Rails::Engine, Rails::Generators::Base,
# Rails.root) targets Rails 3 -- whose routing entry point is
# Rails.application.routes.
Rails.application.routes.draw do
  resources :proxies
end
@@ -0,0 +1,14 @@
1
# Creates the `proxies` table consumed by PbaCrawler::Generic:
# address/port identify the proxy endpoint, last_request_at records
# when the proxy was last used, and position orders the rotation.
class CreateProxy < ActiveRecord::Migration
  # Build the proxies table.
  def self.up
    create_table :proxies do |table|
      table.string   :address
      table.integer  :port
      table.datetime :last_request_at
      table.integer  :position
      table.timestamps
    end
  end

  # Remove the proxies table again.
  def self.down
    drop_table :proxies
  end
end
@@ -0,0 +1,20 @@
1
# encoding: utf-8
# Fixed: this gem ships lib/pba_crawler/file_utilz.rb defining
# PbaCrawler::FileUtilz; the previous require "crawler/file_utilz.rb"
# and Crawler::FileUtilz call referenced a namespace that does not exist
# in this gem and raised LoadError at require time.
require "pba_crawler/file_utilz"

module Crawler
  # Copies database migration files
  #
  # @example
  #   $ rails generate crawler:install
  #
  # @todo Revisit in Rails 3.1 where public assets are treated differently
  class InstallGenerator < Rails::Generators::Base
    desc "Copies database migration files"

    # Mirror the gem's db/ directory (the create_proxy migration) into
    # the host application's db/ directory.
    def copy_files
      source = File.join(File.dirname(__FILE__), '..', '..', '..', '..', 'db')
      destination = File.join(Rails.root, 'db')
      PbaCrawler::FileUtilz.mirror_files(source, destination)
    end
  end
end
@@ -0,0 +1,10 @@
1
# Gem entry point: loads the Rails engine, the third-party crawling
# stack, and the crawler implementation.
require "pba_crawler_engine"


# HTML parsing / scripted-browsing / HTTP stack used by PbaCrawler::Generic.
require 'nokogiri'
require 'mechanize'
require 'net/http'
require 'open-uri'


require 'pba_crawler/generic'
@@ -0,0 +1,73 @@
1
require 'fileutils'

module PbaCrawler
  # File-mirroring helpers used by the install generator to copy the
  # gem's migrations into a host application.
  class FileUtilz
    # A general purpose method to mirror a directory (+source+) into a destination
    # directory, including all files and subdirectories. Files will not be mirrored
    # if they are identical already (checked by content, see same_contents).
    #
    # Copyright (c) 2008 James Adam (The MIT License)

    # Mirror +source+ into +destination+ with backup saving turned on:
    # a differing pre-existing target file is renamed to "<name>~" first.
    def self.mirror_with_backup(source, destination)
      self.mirror_files(source, destination, true)
    end

    # Mirror +source+ (a directory tree or a single file) into
    # +destination+. Files whose content already matches the target are
    # skipped; with +create_backups+ an existing, differing target is
    # renamed with a '~' suffix before being overwritten.
    def self.mirror_files(source, destination, create_backups = false)
      # TODO: use Rake::FileList#pathmap?
      if File.directory?(source)
        source_files = Dir[source + "/**/*"]
        source_dirs = source_files.select { |d| File.directory?(d) }
        source_files -= source_dirs
      elsif File.exist? source
        source_dirs = []
        source_files = [source]
        source = File.dirname source
      else
        puts "Could not mirror #{source} - entity does not exist"
        return
      end

      unless source_files.empty?
        base_target_dir = File.join(destination)
        base_target_dir = File.dirname base_target_dir unless File.directory? base_target_dir
        FileUtils.mkdir_p(base_target_dir)
      end

      base_target_dir ||= File.join(destination)

      source_dirs.each do |dir|
        # strip down these paths so we have simple, relative paths we can
        # add to the destination
        target_dir = File.join(base_target_dir, dir.gsub(source, ''))
        begin
          FileUtils.mkdir_p(target_dir)
        rescue StandardError => e
          # Fixed: was `rescue Exception` (which also swallows
          # SignalException/SystemExit) and `"..." + e`, which raises
          # TypeError because String#+ cannot take an Exception.
          raise "Could not create directory #{target_dir}: \n#{e.message}"
        end
      end

      source_files.each do |file|
        begin
          target = File.join(base_target_dir, file.gsub(source, ''))
          unless File.exist?(target) && self.same_contents(file, target)
            # WAS FileUtils.identical?(file, target), but we want to test contents too
            if create_backups && File.exist?(target)
              File.rename(target, target + '~')
            end
            FileUtils.cp(file, target)
          end
        rescue StandardError => e
          # Fixed: same Exception / String#+ problems as above.
          raise "Could not copy #{file} to #{target}: \n#{e.message}"
        end
      end
    end

    # Content-based comparison -- true when both files hold the same
    # bytes. Used instead of FileUtils.identical? ("for windows users").
    def self.same_contents(source, destination)
      File.read(source) == File.read(destination)
    end

  end
end
@@ -0,0 +1,78 @@
1
module PbaCrawler
  # Fetches URLs through a rotating list of HTTP proxies loaded from the
  # Proxy model. When a request through the current proxy fails, the
  # next proxy (in :position order) is tried until the list is exhausted.
  class Generic

    # Shared across all instances (class variables): the ordered proxy
    # list, the index of the proxy currently in use, and that proxy row.
    # NOTE(review): @@-variables are shared process-wide, so concurrent
    # crawler instances share one cursor -- confirm single-threaded use.
    @@proxies = ""
    @@cursor = 0
    @@proxy = nil

    def initialize
      # load proxy list, ordered by rotation position
      @@proxies = Proxy.order(:position)
    end

    # Fetch +url+ with a fresh Mechanize agent routed through the current
    # proxy; returns the response, or nil when every proxy fails.
    def crawl_url(url)
      agent = Mechanize.new { |a|
        a.html_parser = Nokogiri::HTML
        a.open_timeout = 20
        a.read_timeout = 20
      }
      @@proxy = @@proxies.fetch(@@cursor)
      agent.set_proxy(@@proxy.address,@@proxy.port)
      agent.user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 6.1; nl; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'

      get_url(agent,url)
    end

    # Perform the GET. On success, stamp the proxy's last_request_at; on
    # a nil response or a raised error, fall over to the next proxy.
    def get_url(agent,url)
      begin
        response = agent.get(url)
        if response
          @@proxy.update_attributes(:last_request_at => Time.now)
        else
          # Fixed: the retried response was discarded here, so a
          # successful fetch through a fallback proxy still returned nil.
          response = rescue_get_url(agent,url)
        end
      rescue StandardError
        # Fixed: same discarded-result bug on the exception path -- after
        # a raise, `response` was nil even when the retry succeeded.
        response = rescue_get_url(agent,url)
      end
      response
    end

    # Advance to the next proxy (if any remain) and retry the request.
    # Returns the retried response, or nil when the list is exhausted.
    def rescue_get_url(agent,url)
      if @@cursor + 1 < @@proxies.size
        p "#{agent.proxy_addr} #{agent.proxy_port}"
        @@cursor = @@cursor + 1
        @@proxy = @@proxies.fetch(@@cursor)
        agent.set_proxy(@@proxy.address,@@proxy.port)
        get_url(agent,url)
      end
    end

    # Replace the proxies table with a small built-in sample list
    # (address/port pairs; ports are stored as integers).
    def load_sample
      list = [
        ["101.0.6.87","31280"],
        ["109.120.130.2","3128"],
        ["62.215.5.69","8080"],
        ["62.215.5.66","8080"],
        ["212.33.199.19","80"],
        ["196.1.70.201","80"],
        ["194.87.32.233","80"],
        ["94.125.27.20","8080"],
        ["95.66.35.1","80"],
        ["124.164.247.43","3128"],
        ["186.225.98.17","3128"],
        ["122.155.13.135","3128"],
        ["148.81.130.19","80"],
        ["202.129.58.68","80"],
        ["97.100.143.20","27977"],
        ["24.12.13.154","27977"],
        ["200.129.180.61","80"]
      ]
      Proxy.delete_all
      list.each do |add|
        Proxy.create(:address => add[0], :port => add[1].to_i, :last_request_at => Time.now)
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
module PbaCrawler
  # Gem version, read by pba_crawler.gemspec.
  VERSION = "0.0.4"
end
@@ -0,0 +1,5 @@
1
# Registers this gem as a Rails engine so its app/ directory
# (ProxiesController, proxy views) and config/routes.rb are picked up
# by the host application.
module PbaCrawlerEngine
  class Engine < Rails::Engine
    #engine_name :crawler_engine
  end
end
@@ -0,0 +1,25 @@
1
# -*- encoding: utf-8 -*-
# Gemspec for pba_crawler; the version constant comes from
# lib/pba_crawler/version.rb.
$:.push File.expand_path("../lib", __FILE__)
require "pba_crawler/version"

Gem::Specification.new do |s|
  s.name = "pba_crawler"
  s.version = PbaCrawler::VERSION
  s.platform = Gem::Platform::RUBY
  s.authors = ["Pierre BASILE"]
  s.email = ["pierre.basile@gmail.com"]
  s.homepage = ""
  s.summary = %q{Crawl websites}
  s.description = %q{Crawl websites}

  s.rubyforge_project = "pba_crawler"

  # File lists are derived from git, so the gem must be built from a
  # checkout of the repository.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Exact version pins (see the gem metadata for the resolved tree).
  s.add_dependency('formtastic', '1.2.3')
  s.add_dependency('nokogiri', '1.4.4')
  s.add_dependency('mechanize', '1.0.0')
end
data/readme.textile ADDED
@@ -0,0 +1,16 @@
1
+ h1. Crawler Gem
2
+
3
+ h2. Installation
4
+
5
+ gem 'pba_crawler'
6
+
7
+
8
+ <pre>
9
+ rails g crawler:install
10
+ rake db:migrate
11
+ </pre>
12
+
13
+ h2. Use
14
+
15
+ c = PbaCrawler::Generic.new
16
+ c.crawl_url("http://www.google.fr")
metadata ADDED
@@ -0,0 +1,107 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pba_crawler
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.4
6
+ platform: ruby
7
+ authors:
8
+ - Pierre BASILE
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-03-22 00:00:00 +01:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: formtastic
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - "="
23
+ - !ruby/object:Gem::Version
24
+ version: 1.2.3
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - "="
34
+ - !ruby/object:Gem::Version
35
+ version: 1.4.4
36
+ type: :runtime
37
+ version_requirements: *id002
38
+ - !ruby/object:Gem::Dependency
39
+ name: mechanize
40
+ prerelease: false
41
+ requirement: &id003 !ruby/object:Gem::Requirement
42
+ none: false
43
+ requirements:
44
+ - - "="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.0.0
47
+ type: :runtime
48
+ version_requirements: *id003
49
+ description: Crawl websites
50
+ email:
51
+ - pierre.basile@gmail.com
52
+ executables: []
53
+
54
+ extensions: []
55
+
56
+ extra_rdoc_files: []
57
+
58
+ files:
59
+ - .gitignore
60
+ - Gemfile
61
+ - Rakefile
62
+ - app/controllers/proxies_controller.rb
63
+ - app/models/proxy.rb
64
+ - app/views/proxies/_form.html.erb
65
+ - app/views/proxies/edit.html.erb
66
+ - app/views/proxies/index.html.erb
67
+ - app/views/proxies/new.html.erb
68
+ - config/routes.rb
69
+ - db/migrate/20110321122537_create_proxy.rb
70
+ - lib/generators/crawler/install/install_generator.rb
71
+ - lib/pba_crawler.rb
72
+ - lib/pba_crawler/file_utilz.rb
73
+ - lib/pba_crawler/generic.rb
74
+ - lib/pba_crawler/version.rb
75
+ - lib/pba_crawler_engine.rb
76
+ - pba_crawler.gemspec
77
+ - readme.textile
78
+ has_rdoc: true
79
+ homepage: ""
80
+ licenses: []
81
+
82
+ post_install_message:
83
+ rdoc_options: []
84
+
85
+ require_paths:
86
+ - lib
87
+ required_ruby_version: !ruby/object:Gem::Requirement
88
+ none: false
89
+ requirements:
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: "0"
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ none: false
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: "0"
99
+ requirements: []
100
+
101
+ rubyforge_project: pba_crawler
102
+ rubygems_version: 1.6.2
103
+ signing_key:
104
+ specification_version: 3
105
+ summary: Crawl websites
106
+ test_files: []
107
+