url-vi0lence 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1 @@
1
+ This library automates scraping ("ripping off") content from web pages.
@@ -0,0 +1,2 @@
1
#! /usr/bin/env ruby
# Placeholder executable for the gem; simply announces itself.
$stdout.puts("wee")
@@ -0,0 +1,24 @@
1
require "rubygems"
require "mechanize"
require "fileutils"
require "uri"

# Debugging support. NOTE(review): the original design hard-exits the whole
# process when ruby-debug is not installed, which makes the library unusable
# without a development gem — behavior preserved here, but worth revisiting.
begin
  require "ruby-debug"
  Debugger.start
  Debugger.settings[:autoeval] = true if Debugger.respond_to?(:settings)
rescue LoadError
  puts "You need to install ruby-debug to run the server in debugging mode. With gems, use 'gem install ruby-debug'"
  exit
end

# Load every library file under lib/url_vi0lence (relative to the CWD).
Dir.glob("./lib/url_vi0lence/**/*.rb").each do |file|
  require file
end

# Sample scrapers are opt-in: flip a flag to true to run that sample on load.
# FIX: the original assigned the Hash returned by #each to a local
# (`activate_samples`) that was never used; the dead assignment is removed.
{
  "sg"    => false,
  "gmail" => false
}.each do |file, do_start|
  require "./scrapers/#{file}_scraper.rb" if do_start
end
@@ -0,0 +1,152 @@
1
class PageScraper

=begin
# initialize
#
# parameters
# === user_agent <String> User agent alias (Google it.) (Default: Mac Firefox)
# === redirect <Boolean> Should redirects be followed (Default: true)
# === use_cookies <Boolean> Should cookies be used (Default: true)
# === cookie_jar <String> Path to YAML cookie jar (Default: ./dump/cookies.yml)
=end
  def initialize(config={})
    config = {
      :user_agent => "Mac FireFox",
      :redirect   => true,
      :use_cookies=> true,
      :cookie_jar => "#{PageScraper.dump}/cookies.yml"
    }.merge(config)

    @agent = WWW::Mechanize.new
    @agent.user_agent_alias = config[:user_agent]
    @agent.redirect_ok = config[:redirect]

    if config[:use_cookies]
      # NOTE(review): this writes out the (initially empty) cookie jar;
      # loading a previously saved jar may have been intended — confirm.
      @agent.cookie_jar.save_as(config[:cookie_jar])
    end
  end

=begin
# login
#
# parameters
# = username <String> Username
# = password <String> Password
# = url <String> Login page URL
# = *args <Array>
# === un_field <String> Username HTML field name (Default: username)
# === pw_field <String> Password HTML field name (Default: password)
# === form_id <String|Fixnum> Form HTML name or the sequential number of the form in the page
# = block <Proc> Additional actions to do after login
# => Block should return Page or nil
=end
  def login(username,password,url,*args,&block)
    un_field = args[0] || "username"
    pw_field = args[1] || "password"
    form_id  = args[2] || 0 # first form on the page

    current_page(@agent.get(url))

    # Locate the login form by position (Integer) or by its HTML name.
    # `Integer` replaces the deprecated `Fixnum` check (Fixnum < Integer,
    # so this is backward compatible on old Rubies).
    form = if form_id.is_a?(Integer)
      @page.forms[form_id]
    else # form_id is a String-ish name
      @page.forms.find { |f| f.name == form_id.to_s }
    end

    form.fields.find { |f| f.name == un_field }.value = username
    form.fields.find { |f| f.name == pw_field }.value = password

    current_page(@agent.submit(form))

    # Optional post-login hook; its return value (Page or nil) becomes the
    # current page.
    current_page(block.call({
      :username => username,
      :password => password,
      :un_field => un_field,
      :pw_field => pw_field,
      :form_id  => form_id,
      :agent    => @agent,
      :page     => @page
    })) if block_given?

    return current_page
  rescue StandardError => e
    # FIX: was `rescue Exception`, which also swallowed SystemExit/signals.
    puts "There was an exception during #login:\n#{e}"
    Kernel.exit
  end

=begin
# current_page - Current page Mechanize is on
#
# parameters
# new_page <Mechanize::Page> Page to set current page to (Default nil)
=end
  def current_page(new_page=nil)
    @page = new_page unless new_page.nil?
    @page
  end

=begin
# parse - Executes a block against the site
#
# parameters
# = pages <Array> List of urls to execute the block against (Default = current_page)
# = block <Proc> Action that should be performed
# => Access will be available to agent and page
# => Block should return Page || nil
=end
  def parse(pages=nil,&block)
    pages = [current_page] if pages.nil?

    # Turn URL strings into Mechanize::Page objects.
    # FIX: the original `collect!` block returned nil for elements that were
    # already pages (the `if` had no else branch), wiping them out. The
    # leftover `debugger` breakpoint has also been removed.
    pages = pages.collect do |page|
      page.is_a?(String) ? @agent.get(page) : page
    end

    pages.collect { |page| current_page(block.call(page, @agent)) }
  end

=begin
# filter_links - Filters links based on a regex match
#
# parameters
# = match_pattern <Regex> Pattern of links to keep
# => Returns the unique matching links (empty Array when nothing matches
#    or the page has no links)
=end
  def filter_links(match_pattern)
    # FIX: the original collected into a local `links` but then called
    # `@links.uniq!` on a nil instance variable; the NoMethodError was
    # swallowed by `rescue Exception` and — because `ensure` does not set a
    # return value — the method ALWAYS returned nil.
    links = current_page.links.select { |link| link.uri.to_s =~ match_pattern }
    links.uniq
  rescue StandardError
    [] # no current page / malformed links => no matches
  end

  class << self
    # Scratch directory for cookies and downloads; created on demand.
    def dump
      path = "./dump"
      FileUtils.mkdir_p(path)
      path
    end

    private

    # Registry of all subclasses, shared across the hierarchy
    # (populated by .inherited below).
    def children
      @@children ||= []
    end

    def inherited(klass)
      super
    ensure
      children << klass
    end
  end
end
@@ -0,0 +1,32 @@
1
class GmailScraper < PageScraper
  # Proc for PageScraper#parse: walks each inbox message row in Gmail's
  # basic-HTML view (rows carry bgcolor #ffffff) and prints the sender,
  # subject, and a rebuilt absolute link for the message.
  # NOTE(review): relies on `search` accepting a block per-node (old
  # Hpricot-era behavior) — confirm against the Mechanize version in use.
  NewInboxMessages = Proc.new {|page, agent|
    page.search("//tr[@bgcolor='#ffffff']") do |row|
      # First two <b> text nodes in the row are sender and subject.
      from, subject = *row.search("//b/text()")
      # Splice the row's relative href onto the inbox URL's base.
      url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
      puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
    end
  }
end
10
+
11
gmail = GmailScraper.new

# Fill in your Gmail credentials before running this sample.
gmail_u = ""
gmail_p = ""

# "Email"/"Passwd" are Gmail's HTML field names for username/password.
gmail.login gmail_u,gmail_p,"http://www.gmail.com","Email","Passwd" do |info|
  #This tells Gmail that we want to use the basic, no-js version
  info[:agent].get info[:page].uri.to_s.sub(/\?.*$/, "?ui=html&zy=n")
end


#you could pass it a block or a proc

#gmail.parse do |page, agent|
#  page.search("//tr[@bgcolor='#ffffff']") do |row|
#    from, subject = *row.search("//b/text()")
#    url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
#    puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
#  end
#end

# Run the canned inbox-listing proc against the current (post-login) page.
gmail.parse &GmailScraper::NewInboxMessages
@@ -0,0 +1,53 @@
1
class SgScraper < PageScraper
  # Lambda for PageScraper#parse: downloads every numbered .jpg linked from
  # the given page into ./dump/sg/<girl>/<photoset>/.
  ImageGanker = lambda { |page, agent|
    begin
      # Keep only links to jpgs with digits in the name.
      hrefs = page.links.map { |m|
        m.href
      }.select { |u|
        u =~ /[0-9]+.jpg/
      } #just jpgs with numbers in the name

      create_folder = true

      hrefs.each { |image|
        image_name = image.split("/")

        #Folder name = GirlName/PhotoSet (taken from the URL path segments)
        folder = "#{SgScraper.dump}/sg/#{image_name[-4]}/#{image_name[-2]}"

        # Only create the directory once per page.
        if create_folder
          FileUtils.mkdir_p(folder)
        end

        filename = "#{folder}/#{image_name[-1]}"
        # FIX: the progress message contained a garbled token where the
        # destination filename should be interpolated.
        puts "Saving #{image} as #{filename}"
        agent.get(image).save_as(filename)

        create_folder = false
      }
    rescue StandardError => e
      # FIX: was `rescue Exception`. Best-effort download: log and move on.
      puts e
      puts 'Failed to get a file boo hoo'
    end
  }

end
35
+
36
+
37
sg = SgScraper.new

# Fill in your SuicideGirls credentials before running this sample.
sg_u = ""
sg_p = ""

# Uses the default "username"/"password" field names and the first form.
sg.login sg_u,sg_p,"http://www.suicidegirls.com" do |info|
  #Go to a particular page after login
  info[:agent].get "http://suicidegirls.com/girls/Ren/photos/Mars+Attacks/"
end

# Album pages to run the image downloader against.
urls=[
  "http://suicidegirls.com/girls/Ren/photos/Forest+Tea/",
  "http://suicidegirls.com/girls/Gatsby/photos/Trinkets/",
  "http://suicidegirls.com/members/Gatsby/albums/site/3578/"
]
sg.parse(urls, &SgScraper::ImageGanker)
53
+
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url-vi0lence
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Cory ODaniel
8
+ autorequire: url_vi0lence
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-04-29 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: urlviolence@coryodaniel.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - bin/url-vi0lence
26
+ - lib/url-vi0lence.rb
27
+ - lib/url_vi0lence
28
+ - lib/url_vi0lence/page_scraper.rb
29
+ - scrapers/gmail_scraper.rb
30
+ - scrapers/sg_scraper.rb
31
+ - README
32
+ has_rdoc: true
33
+ homepage: url-vi0lence.rubyforge.com
34
+ post_install_message:
35
+ rdoc_options: []
36
+
37
+ require_paths:
38
+ - lib
39
+ required_ruby_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: "0"
50
+ version:
51
+ requirements: []
52
+
53
+ rubyforge_project:
54
+ rubygems_version: 1.0.1
55
+ signing_key:
56
+ specification_version: 2
57
+ summary: Ripping off other peoples hard work, or automating testing whatev. Built on Mechanize
58
+ test_files: []
59
+