url-vi0lence 0.0.1
- data/README +1 -0
- data/bin/url-vi0lence +2 -0
- data/lib/url-vi0lence.rb +24 -0
- data/lib/url_vi0lence/page_scraper.rb +152 -0
- data/scrapers/gmail_scraper.rb +32 -0
- data/scrapers/sg_scraper.rb +53 -0
- metadata +59 -0
data/README
ADDED
@@ -0,0 +1 @@
+This is for ripping shit off.
data/bin/url-vi0lence
ADDED
data/lib/url-vi0lence.rb
ADDED
@@ -0,0 +1,24 @@
+require "rubygems"
+require "mechanize"
+require "fileutils"
+require "uri"
+
+begin
+  require "ruby-debug"
+  Debugger.start
+  Debugger.settings[:autoeval] = true if Debugger.respond_to?(:settings)
+rescue LoadError
+  puts "You need to install ruby-debug to run in debugging mode. With gems, use 'gem install ruby-debug'"
+  exit
+end
+
+Dir.glob("./lib/url_vi0lence/**/*.rb").each do |file|
+  require file
+end
+
+activate_samples = {
+  "sg"    => false,
+  "gmail" => false
+}.each do |file, do_start|
+  require "./scrapers/#{file}_scraper.rb" if do_start
+end
data/lib/url_vi0lence/page_scraper.rb
ADDED
@@ -0,0 +1,152 @@
+class PageScraper
+
+=begin
+# initialize
+#
+# parameters
+# === user_agent <String> User agent alias (Google it.) (Default: Mac FireFox)
+# === redirect <Boolean> Should redirects be followed (Default: true)
+# === use_cookies <Boolean> Should cookies be used (Default: true)
+# === cookie_jar <String> Path to YAML cookie jar (Default: ./dump/cookies.yml)
+=end
+  def initialize(config={})
+    config = {
+      :user_agent  => "Mac FireFox",
+      :redirect    => true,
+      :use_cookies => true,
+      :cookie_jar  => "#{PageScraper.dump}/cookies.yml"
+    }.merge(config)
+
+    @agent = WWW::Mechanize.new
+    @agent.user_agent_alias = config[:user_agent]
+    @agent.redirect_ok = config[:redirect]
+
+    if config[:use_cookies]
+      @agent.cookie_jar.save_as(config[:cookie_jar])
+    end
+  end
+
+=begin
+# login
+#
+# parameters
+# = username <String> Username
+# = password <String> Password
+# = url <String> Login page URL
+# = *args <Array>
+# === un_field <String> Username HTML field name (Default: username)
+# === pw_field <String> Password HTML field name (Default: password)
+# === form_id <String|Fixnum> Form HTML name or the sequential number of the form in the page
+# = block <Proc> Additional actions to do after login
+# => Block should return Page or nil
+=end
+  def login(username, password, url, *args, &block)
+    un_field = args[0] || "username"
+    pw_field = args[1] || "password"
+    form_id  = args[2] || 0 # First form
+
+    current_page(@agent.get(url))
+
+    if form_id.is_a? Fixnum
+      form = @page.forms[form_id]
+    else # form_id.is_a? String
+      form = @page.forms.find { |f| f.name == form_id.to_s }
+    end
+
+    form.fields.find { |f| f.name == un_field }.value = username
+    form.fields.find { |f| f.name == pw_field }.value = password
+
+    current_page(@agent.submit(form))
+
+    current_page(block.call({
+      :username => username,
+      :password => password,
+      :un_field => un_field,
+      :pw_field => pw_field,
+      :form_id  => form_id,
+      :agent    => @agent,
+      :page     => @page
+    })) if block_given?
+
+    return current_page
+  rescue Exception => e
+    puts "There was an exception during #login:\n#{e}"
+    Kernel.exit
+  end
+
+=begin
+# current_page - Current page Mechanize is on
+#
+# parameters
+# new_page <Mechanize::Page> Page to set current page to (Default nil)
+=end
+  def current_page(new_page=nil)
+    @page = new_page unless new_page.nil?
+    @page
+  end
+
+=begin
+# parse - Executes a block against the site
+#
+# parameters
+# = pages <Array> List of urls to execute the block against (Default = current_page)
+# = block <Proc> Action that should be performed
+# => Access will be available to agent and page
+# => Block should return Page || nil
+=end
+  def parse(pages=nil, &block)
+    pages = [current_page] if pages.nil?
+
+    # Turn strings into Mechanize::Page, leave Page objects as-is
+    pages.collect! do |page|
+      if page.is_a? String
+        page = @agent.get(page)
+      end
+      page
+    end
+    pages.collect { |page|
+      current_page(block.call(page, @agent))
+    }
+  end
+
+=begin
+# filter_links - Filters links based on a regex match
+#
+# parameters
+# = match_pattern <Regex> Pattern of links to keep
+# TODO: I'm sure there is a cleaner implementation
+=end
+  def filter_links(match_pattern)
+    links = []
+    current_page.links.each do |link|
+      if link.uri.to_s =~ match_pattern
+        links.push(link)
+      end
+    end
+
+    # Return matching links, de-duplicated
+    links.uniq
+  rescue Exception => e
+    # Swallow errors and return an empty list
+    []
+  end
+
+  class << self
+    def dump
+      @@dump = "./dump"
+      FileUtils.mkdir_p(@@dump)
+      @@dump
+    end
+
+    private
+    def children
+      @@children ||= []
+    end
+
+    def inherited(klass)
+      super
+    ensure
+      children << klass
+    end
+  end
+end
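
Neither bundled sample exercises filter_links; a rough sketch of combining it with parse (the gallery URL and the link pattern are illustrative assumptions):

    scraper = PageScraper.new
    # Fetch a page so current_page is set; the block just hands the page back.
    scraper.parse(["http://example.com/gallery"]) { |page, agent| page }
    # Keep only links whose URL looks like a numbered jpg.
    jpg_links = scraper.filter_links(/[0-9]+\.jpg/)
    jpg_links.each { |link| puts link.uri }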
data/scrapers/gmail_scraper.rb
ADDED
@@ -0,0 +1,32 @@
+class GmailScraper < PageScraper
+  NewInboxMessages = Proc.new { |page, agent|
+    page.search("//tr[@bgcolor='#ffffff']") do |row|
+      from, subject = *row.search("//b/text()")
+      url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
+      puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
+    end
+  }
+end
+
+gmail = GmailScraper.new
+
+gmail_u = ""
+gmail_p = ""
+
+gmail.login gmail_u, gmail_p, "http://www.gmail.com", "Email", "Passwd" do |info|
+  # This tells Gmail that we want to use the basic, no-js version
+  info[:agent].get info[:page].uri.to_s.sub(/\?.*$/, "?ui=html&zy=n")
+end
+
+
+# You could pass it a block or a proc
+
+#gmail.parse do |page, agent|
+#  page.search("//tr[@bgcolor='#ffffff']") do |row|
+#    from, subject = *row.search("//b/text()")
+#    url = page.uri.to_s.sub(/ui.*$/, row.search("//a").first.attributes["href"])
+#    puts "From: #{from}\nSubject: #{subject}\nLink: #{url}\n\n"
+#  end
+#end
+
+gmail.parse &GmailScraper::NewInboxMessages
data/scrapers/sg_scraper.rb
ADDED
@@ -0,0 +1,53 @@
+class SgScraper < PageScraper
+  ImageGanker = lambda { |page, agent|
+    begin
+      hrefs = page.links.map { |m|
+        m.href
+      }.select { |u|
+        u =~ /[0-9]+\.jpg/
+      } # just jpgs with numbers in the name
+
+      create_folder = true
+
+      hrefs.each { |image|
+        image_name = image.split("/")
+
+        # Folder name = GirlName/PhotoSet
+        folder = "#{SgScraper.dump}/sg/#{image_name[-4]}/#{image_name[-2]}"
+
+        if create_folder
+          FileUtils.mkdir_p(folder)
+        end
+
+        filename = "#{folder}/#{image_name[-1]}"
+        puts "Saving #{image} as #{filename}"
+        agent.get(image).save_as(filename)
+
+        create_folder = false
+      }
+    rescue Exception => e
+      puts e
+      puts "Failed to get a file, boo hoo"
+    end
+  }
+
+end
+
+
+sg = SgScraper.new
+
+sg_u = ""
+sg_p = ""
+
+sg.login sg_u, sg_p, "http://www.suicidegirls.com" do |info|
+  # Go to a particular page after login
+  info[:agent].get "http://suicidegirls.com/girls/Ren/photos/Mars+Attacks/"
+end
+
+urls = [
+  "http://suicidegirls.com/girls/Ren/photos/Forest+Tea/",
+  "http://suicidegirls.com/girls/Gatsby/photos/Trinkets/",
+  "http://suicidegirls.com/members/Gatsby/albums/site/3578/"
+]
+sg.parse(urls, &SgScraper::ImageGanker)
+
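
The URL list above is hard-coded; an alternative sketch would discover photo-set links from the page reached after login by pairing filter_links with the bundled ImageGanker proc (the /photos/ pattern is an assumption about the site's URL layout):

    # Illustrative only: collect set links off the current page, then gank their images.
    set_urls = sg.filter_links(/\/photos\//).map { |link| link.uri.to_s }
    sg.parse(set_urls, &SgScraper::ImageGanker)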
metadata
ADDED
@@ -0,0 +1,59 @@
+--- !ruby/object:Gem::Specification
+name: url-vi0lence
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Cory ODaniel
+autorequire: url_vi0lence
+bindir: bin
+cert_chain: []
+
+date: 2008-04-29 00:00:00 -07:00
+default_executable:
+dependencies: []
+
+description:
+email: urlviolence@coryodaniel.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files:
+- README
+files:
+- bin/url-vi0lence
+- lib/url-vi0lence.rb
+- lib/url_vi0lence
+- lib/url_vi0lence/page_scraper.rb
+- scrapers/gmail_scraper.rb
+- scrapers/sg_scraper.rb
+- README
+has_rdoc: true
+homepage: url-vi0lence.rubyforge.com
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: Ripping off other people's hard work, or automating testing whatev. Built on Mechanize
+test_files: []
+
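
The metadata above is the serialized (YAML) form of the gemspec; expressed as the usual Ruby Gem::Specification it would correspond roughly to the sketch below, reconstructed from the fields shown rather than taken from an actual .gemspec in the gem:

    Gem::Specification.new do |s|
      s.name             = "url-vi0lence"
      s.version          = "0.0.1"
      s.authors          = ["Cory ODaniel"]
      s.email            = "urlviolence@coryodaniel.com"
      s.homepage         = "url-vi0lence.rubyforge.com"
      s.summary          = "Ripping off other people's hard work, or automating testing whatev. Built on Mechanize"
      s.autorequire      = "url_vi0lence"
      s.require_paths    = ["lib"]
      s.extra_rdoc_files = ["README"]
      s.files            = ["bin/url-vi0lence", "lib/url-vi0lence.rb",
                            "lib/url_vi0lence/page_scraper.rb",
                            "scrapers/gmail_scraper.rb", "scrapers/sg_scraper.rb",
                            "README"]
    end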