warc 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/warc +4 -0
- data/lib/warc.rb +30 -0
- data/lib/warc/cli.rb +23 -0
- data/lib/warc/exceptions.rb +0 -0
- data/lib/warc/ext/net_http.rb +50 -0
- data/lib/warc/http.rb +35 -0
- data/lib/warc/parser.rb +24 -0
- data/lib/warc/proxy.rb +1 -0
- data/lib/warc/proxy/css/main.css +45 -0
- data/lib/warc/proxy/proxy.rb +85 -0
- data/lib/warc/proxy/views/index.erb +16 -0
- data/lib/warc/proxy/views/layout.erb +9 -0
- data/lib/warc/record.rb +59 -0
- data/lib/warc/record/header.rb +88 -0
- data/lib/warc/record/validator.rb +13 -0
- data/lib/warc/stream.rb +96 -0
- data/lib/warc/stream/gzip.rb +35 -0
- data/lib/warc/stream/plain.rb +23 -0
- data/lib/warc/utils/header_hash.rb +63 -0
- data/lib/warc/version.rb +3 -0
- data/spec/fixtures/arg.warc +267 -0
- data/spec/fixtures/criterion.warc +643 -0
- data/spec/fixtures/criterion.warc.gz +0 -0
- data/spec/fixtures/frg.warc +3617 -4
- data/spec/fixtures/frg.warc.gz +0 -0
- data/spec/fixtures/http_imdb +954 -0
- data/spec/spec_helper.rb +27 -0
- data/spec/warc/http_spec.rb +9 -0
- data/spec/warc/record/header_spec.rb +37 -0
- data/spec/warc/record_spec.rb +20 -0
- data/spec/warc/stream/gzip_spec.rb +46 -0
- data/spec/warc/stream/plain_spec.rb +41 -0
- data/spec/warc/stream_spec.rb +55 -0
- data/warc.gemspec +27 -0
- metadata +195 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
ZTQyNTg2YWM5ODBiODFlOGE4ZjczMjM0YmZmNWViZTk0MWM0OTdhZg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YjJjZWU1YzFhZWRkNmI4M2QyZDkyNDE4N2M1YWVlYmE3NjU0NjVlZQ==
|
7
|
+
SHA512:
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
MmEyNDRiMjNhMWYxNTc4OGI1OTRkYTAzYmRkNTIwNzcwMTc2Yjg2Zjc5MjAx
|
10
|
+
YjI4OWQ2NDYzMTQyNzkxNDcwNmUzZjBjZWU3Yzg2NzdiMmNiNGZiNDdjMTEz
|
11
|
+
NTJlODllYzljOGNkNzRmM2QzM2QxMWFkZGI2ZmJiZGY4YjMzOGY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ODIzMDhiZWIxMTI5YTRiOWQ3NzQzMDc5MjM5MTU4MWJlNGE0OTE1ZjU1YTAz
|
14
|
+
ZjE5MDc5ZDhiYmRiMGQ4NzUwYTc4NWI2MDUxNzcxODM1NjY4NDAxMWI4NTc1
|
15
|
+
MDdkZGQwNzcyYTQxMjNkOTQyZGNiY2YyMmI5YmE5NjQzNWUwNmM=
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 antoine
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Warc
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'warc'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install warc
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/bin/warc
ADDED
data/lib/warc.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require "warc/version"
|
2
|
+
|
3
|
+
# Utils
|
4
|
+
require "warc/utils/header_hash"
|
5
|
+
|
6
|
+
# Updated NET library (will be included in 2.0)
|
7
|
+
require "warc/ext/net_http"
|
8
|
+
|
9
|
+
# http tools
|
10
|
+
require "warc/http"
|
11
|
+
|
12
|
+
# Stream
|
13
|
+
require "warc/stream"
|
14
|
+
require "warc/stream/plain"
|
15
|
+
require "warc/stream/gzip"
|
16
|
+
|
17
|
+
# Record
|
18
|
+
require "warc/record"
|
19
|
+
require "warc/record/validator"
|
20
|
+
require "warc/record/header"
|
21
|
+
|
22
|
+
# Parser
|
23
|
+
require "warc/parser"
|
24
|
+
|
25
|
+
# Exception
|
26
|
+
require "warc/exceptions"
|
27
|
+
|
28
|
+
# Root namespace of the gem. It is intentionally empty here: the files
# required above reopen it to add their own classes.
module Warc; end
|
data/lib/warc/cli.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'warc'
|
2
|
+
require 'warc/proxy'
|
3
|
+
require 'thor'
|
4
|
+
|
5
|
+
module Warc
  # Thor-based command line interface: `warc dump` and `warc replay`.
  class CLI < Thor
    desc "dump WARC_FILE", "Dump record headers from WARC_FILE"
    method_option :offset, :default => 0, :type => :numeric
    # Prints one tab-separated line per record of the archive at +path+,
    # preceded by a header line naming the columns. Iteration is started
    # from the --offset option (0 by default).
    def dump(path)
      stream = Warc.open_stream(path)
      puts "WARC filename\toffset\twarc-type\twarc-target-uri\twarc-record-id\tcontent-type\tcontent-length"
      stream.each(options[:offset]) do |record|
        header = record.header
        columns = [
          path,
          record.offset,
          header['warc-type'],
          header['warc-target-uri'],
          header['warc-record-id'],
          header['content-type'],
          header.content_length
        ]
        puts columns.map(&:to_s).join("\t")
      end
    end

    desc "replay WARC_FILE", "Start a HTTP proxy serving request from WARC_FILE. Dashboard available at http://warc/"
    # The option has to be named :port because that is the key read below;
    # it was previously declared as :p, which left options[:port] nil.
    # The -p alias keeps the original short switch working.
    method_option :port, :aliases => "-p", :default => 9292, :type => :numeric, :banner => "port"
    # Starts the replay proxy for the archive +warc+ on the configured port.
    def replay(warc)
      Warc::Proxy::Replay.start(warc, options[:port])
    end
  end
end
|
File without changes
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'net/https'
|
3
|
+
require 'stringio'
|
4
|
+
|
5
|
+
module Net
  # Reopens Net::BufferedIO so that it can be built from in-memory data
  # or a file path as well as from a real socket.
  class BufferedIO
    # io           - a Socket, OpenSSL::SSL::SSLSocket, StringIO or IO (used
    #                as is), or a String. A String naming an existing
    #                regular file is opened for reading; any other String
    #                is wrapped in a StringIO.
    # debug_output - stored in @debug_output.
    #
    # Raises RuntimeError when +io+ is none of the supported types.
    def initialize(io, debug_output = nil)
      @read_timeout = 60
      @rbuf = ''
      @debug_output = debug_output

      @io = case io
            when Socket, OpenSSL::SSL::SSLSocket, StringIO, IO
              io
            when String
              # The NUL check comes first: File methods raise ArgumentError
              # on strings containing a null byte.
              # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
              if !io.include?("\0") && File.exist?(io) && !File.directory?(io)
                File.open(io, "r")
              else
                StringIO.new(io)
              end
            end
      raise "Unable to create fake socket from #{io}" unless @io
    end
  end

  class HTTP
    class << self
      # Socket class handed out by Net::HTTP.socket_type. This used to name
      # FakeWeb::StubSocket, a constant this gem does not define; the stub
      # shipped with the gem is Warc::StubSocket.
      def socket_type_with_warc
        ::Warc::StubSocket
      end
      alias_method :socket_type, :socket_type_with_warc
    end
  end
end
|
35
|
+
|
36
|
+
module Warc
  # Inert socket stand-in: it accepts any constructor arguments, always
  # reports itself as closed and never yields any data.
  class StubSocket #:nodoc:
    def initialize(*_ignored); end

    # Always true; the flag is stored in @closed on the first call.
    def closed?
      @closed = true unless @closed
      @closed
    end

    # Accepts and ignores any arguments; returns nil.
    def readuntil(*_ignored); end
  end
end
|
data/lib/warc/http.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
module Warc
  # Fetches resources over HTTP and wraps the responses in WARC records.
  class HTTP
    # Performs a GET on +uri+ and builds a "response" record from it.
    #
    # The record content is the status line, every response header (one
    # line per value, so repeated headers such as Set-Cookie are all
    # kept), a blank line and the body.
    #
    # Returns a two-element array: [record, Net::HTTP response].
    def self.get(uri)
      url = URI(uri)
      net_res = Net::HTTP.get_response(url)
      response_headers = net_res.to_hash

      record = ::Warc::Record.new
      record.header["WARC-Type"] = "response"
      # Servers may omit Date; fall back to the time of receipt, in the same
      # HTTP-date format, instead of failing on nil.
      record.header.date = Array(response_headers["date"]).first ||
                           Time.now.utc.strftime("%a, %d %b %Y %H:%M:%S GMT")
      record.header["WARC-Target-URI"] = url.to_s
      record.header["Content-Type"] = "application/http;msgtype=response"

      head = String.new
      head << "HTTP/#{net_res.http_version} #{net_res.code} #{net_res.message}\r\n"
      response_headers.each do |key, values|
        Array(values).each { |value| head << "#{key}: #{value}\r\n" }
      end

      record.content = "#{head}\r\n#{net_res.body}"
      return record, net_res
    end

    # Fetches +uri+ and appends the resulting record to +stream+.
    #
    # stream - a Warc::Stream, or a String which is handed to
    #          Warc::Stream::Gzip.new.
    #
    # Returns the Net::HTTP response.
    # Raises ArgumentError for any other kind of +stream+.
    def self.archive(uri, stream)
      target = case stream
               when ::Warc::Stream
                 stream
               when String
                 ::Warc::Stream::Gzip.new(stream)
               else
                 raise ArgumentError, "stream must be a Warc::Stream or a String, got #{stream.class}"
               end

      record, response = get(uri)
      target.write_record(record)
      response
    end
  end
end
|
data/lib/warc/parser.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Warc
  # Reads a single WARC record from a stream.
  class Parser
    # Skips lines until a "WARC/1.0" version line is found, then reads the
    # header fields up to the first line that is not "Name: value", and
    # finally reads header.content_length bytes of content.
    #
    # stream - must respond to tell, readline and read.
    #
    # Returns the Warc::Record; its offset is the stream position of the
    # version line. Exceptions raised by stream.readline (for instance at
    # end of input) are not rescued here.
    def parse(stream)
      # Declared outside the loop: a variable first assigned inside the
      # block would be local to it and lost.
      offset = nil
      loop do
        offset = stream.tell
        break if stream.readline.chomp("\r\n") == "WARC/1.0"
      end

      rec = Warc::Record.new
      rec.offset = offset

      # Non-greedy name so that the split happens on the first ": "; a
      # greedy match would fold part of a value containing ": " into the
      # field name.
      while (field = /^(.*?): (.*)/.match(stream.readline))
        rec.header[field[1]] = field[2].chomp("\r")
      end

      rec.content = stream.read(rec.header.content_length)
      rec
    end
  end
end
|
data/lib/warc/proxy.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'warc/proxy/proxy'
|
@@ -0,0 +1,45 @@
|
|
1
|
+
body {
|
2
|
+
font-family: Helvetica;
|
3
|
+
}
|
4
|
+
|
5
|
+
h1 {
|
6
|
+
font-size: 40px;
|
7
|
+
}
|
8
|
+
|
9
|
+
h2 {
|
10
|
+
font-size:20px;
|
11
|
+
margin:10px;
|
12
|
+
margin-top:25px;
|
13
|
+
}
|
14
|
+
|
15
|
+
ul {
|
16
|
+
list-style: none;
|
17
|
+
font-size:12px;
|
18
|
+
}
|
19
|
+
|
20
|
+
#wrapper {
|
21
|
+
width:600px;
|
22
|
+
margin:0px auto;
|
23
|
+
}
|
24
|
+
|
25
|
+
#editor {
|
26
|
+
|
27
|
+
}
|
28
|
+
|
29
|
+
#header {
|
30
|
+
background-color: black;
|
31
|
+
color: white;
|
32
|
+
height: 300px;
|
33
|
+
border-radius:200px;
|
34
|
+
-moz-border-radius:200px;
|
35
|
+
-webkit-border-radius: 200px;
|
36
|
+
width: 300px;
|
37
|
+
padding:25px;
|
38
|
+
margin:0 auto;
|
39
|
+
text-align:center;
|
40
|
+
}
|
41
|
+
|
42
|
+
iframe {
|
43
|
+
width:100%;
|
44
|
+
height:400px;
|
45
|
+
}
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'rack'
|
2
|
+
require 'sinatra/base'
|
3
|
+
|
4
|
+
module Warc
  module Proxy
    # Rack application that replays archived HTTP responses.
    #
    # Requests whose Host header is "warc" are handled by the Sinatra
    # routes below (the dashboard); every other request is answered from
    # the archive, looked up by its full http:// URI.
    class Replay < Sinatra::Base
      set :public_dir, File.expand_path('..', __FILE__) # set up the static dir (with images/js/css inside)
      set :views, File.expand_path('../views', __FILE__) # set up the views dir

      disable :protection

      # Timestamp of the record last selected through /record/*, keyed by
      # client IP. It must exist before the route writes to it; nothing in
      # this file reads it back.
      @@session = {}

      before do
        headers["Access-Control-Allow-Origin"] = "*"
      end

      # Dashboard listing the indexed URIs.
      get "/" do
        @size = @index.size
        erb :index
      end

      # Selects the record whose index key is given by the splat, stores
      # its WARC-Date for the calling IP and redirects to its target URI.
      # Answers 404 when the key is not in the index.
      get "/record/*" do
        id = params[:splat].join('/')
        halt 404, "not found" unless @index.key?(id)

        record = @warc.record(@index[id])
        @@session[request.ip] = Time.parse(record.header["WARC-Date"])
        redirect to(record.header["WARC-Target-URI"])
      end

      # app  - downstream Rack app, passed to Sinatra::Base.
      # warc - archive handed to Warc.open_stream.
      #
      # Walks the whole archive once to map the URI of every "response"
      # record to its offset.
      def initialize(app=nil,warc=nil)
        super(app)
        @warc = ::Warc.open_stream(warc)
        @index = {}
        puts "Building index"
        @warc.each do |record|
          next unless record.header["warc-type"] == "response"
          @index[record.header.uri] = record.offset
        end
        puts "Indexing done"
      end

      # Rack entry point: dashboard for host "warc", archive otherwise.
      def call(env)
        if env["HTTP_HOST"] == "warc"
          super(env)
        else
          serve(env)
        end
      end

      # Returns the archived response for the requested URI as a Rack
      # triplet, or a 404 triplet when the URI is not indexed.
      def serve(env)
        uri = "http://#{env['HTTP_HOST']}#{env['REQUEST_URI']}"
        return [404,{"Content-Type" => "text/html"},["not found"]] unless @index.key?(uri)

        http_response(@warc.record(@index[uri]))
      end

      # Splits the stored HTTP message of +record+ into a Rack triplet.
      # The body is the StringIO positioned just after the header section.
      def http_response(record)
        io = StringIO.new(record.content)
        headers = {}
        status = /^HTTP\/(\d\.\d) (\d++) (.*)/.match(io.readline)
        # Rack expects an integer status. nil.to_i is 0 when the status
        # line does not match.
        code = (status && status[2]).to_i

        # Non-greedy name: split on the first ": " of the line.
        while (field = /^(.*?): (.*)/.match(io.readline))
          headers[field[1]] = field[2].chomp("\r")
        end
        [code,headers,io]
      end

      # Builds the application for +warc+ and serves it with thin on +port+.
      def self.start(warc,port)
        app = Rack::Builder.new {
          run Warc::Proxy::Replay.new(nil,warc)
        }
        puts "Starting proxy server on port #{port}"
        ::Rack::Server.start(:app => app,:Port => port,:server=>:thin)
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
<div id="wrapper">
  <div id="header">
    <h1>WARC</h1>
    <h2>Web ARChive</h2>
    <p>Currently serving <b><%= @size %> records</b></p>
  </div>

  <div>
    <h2>Records:</h2>
    <ul>
      <%# The URIs come from archived content, so escape them before writing them into the page. %>
      <% @index.each do |uri, _offset| %>
        <% safe_uri = Rack::Utils.escape_html(uri) %>
        <li><a href="<%= safe_uri %>"><%= safe_uri %></a></li>
      <% end %>
    </ul>
  </div>
</div>
|
data/lib/warc/record.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
module Warc
  # A single WARC record: a header plus an opaque content block.
  class Record
    VERSION = "WARC/1.0"

    attr_accessor :content, :offset
    attr_reader :header

    # h       - a Hash of header fields, or a WEBrick::HTTPResponse from
    #           which a "response" record (headers and content) is built.
    # content - the record block; replaced by the serialized HTTP message
    #           when +h+ is a WEBrick response.
    #
    # For any other +h+ no header is created.
    def initialize(h={},content=nil)
      @content = content
      case h
      when Hash
        @header = Header.new(self,h)
      when WEBrick::HTTPResponse
        @header = Header.new(self)
        @header["WARC-Type"] = "response"
        @header["WARC-Target-URI"] = h.request_uri.to_s
        @header["Content-Type"] = "application/http;msgtype=response"
        #@header["WARC-IP-Address"]

        # Serialize the HTTP message: status line, headers, blank line, body.
        crlf = "\r\n"
        message = String.new
        message << h.status_line
        h.header.each do |name,value|
          message << "#{name}: #{value}#{crlf}"
        end
        message << crlf + h.body
        self.content = message

        # NOTE(review): called for its side effect only; presumably it
        # records the block digest in the header. Confirm in header.rb.
        header.block_digest
        @header["WARC-Payload-Digest"] = header.compute_digest(h.body)
      end
    end

    # Parses the content as an HTTP response.
    #
    # Returns a Net::HTTPResponse with its body read, or nil when the
    # record's Content-Type is not "application/http;msgtype=response".
    def to_http
      return unless @header["Content-Type"] == "application/http;msgtype=response"

      socket = Net::BufferedIO.new(content)
      response = Net::HTTPResponse.read_new(socket)
      response.reading_body(socket,true) {}
      response
    end

    # Writes the record to +out+ (anything responding to write) as:
    #
    #   "WARC/1.0" CRLF
    #   header.to_s
    #   CRLF
    #   content CRLF CRLF
    #
    # A nil content is written as an empty block.
    def dump_to(out)
      crlf = "\r\n"

      out.write(VERSION + crlf)
      out.write(header.to_s)
      out.write(crlf)
      out.write(content.to_s + crlf*2)
    end
  end
end
|