warc 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZTQyNTg2YWM5ODBiODFlOGE4ZjczMjM0YmZmNWViZTk0MWM0OTdhZg==
5
+ data.tar.gz: !binary |-
6
+ YjJjZWU1YzFhZWRkNmI4M2QyZDkyNDE4N2M1YWVlYmE3NjU0NjVlZQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ MmEyNDRiMjNhMWYxNTc4OGI1OTRkYTAzYmRkNTIwNzcwMTc2Yjg2Zjc5MjAx
10
+ YjI4OWQ2NDYzMTQyNzkxNDcwNmUzZjBjZWU3Yzg2NzdiMmNiNGZiNDdjMTEz
11
+ NTJlODllYzljOGNkNzRmM2QzM2QxMWFkZGI2ZmJiZGY4YjMzOGY=
12
+ data.tar.gz: !binary |-
13
+ ODIzMDhiZWIxMTI5YTRiOWQ3NzQzMDc5MjM5MTU4MWJlNGE0OTE1ZjU1YTAz
14
+ ZjE5MDc5ZDhiYmRiMGQ4NzUwYTc4NWI2MDUxNzcxODM1NjY4NDAxMWI4NTc1
15
+ MDdkZGQwNzcyYTQxMjNkOTQyZGNiY2YyMmI5YmE5NjQzNWUwNmM=
@@ -0,0 +1,20 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .DS_Store
19
+ .project
20
+ .buildpath
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in warc.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 antoine
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Warc
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'warc'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install warc
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require 'warc/cli'
3
+
4
+ Warc::CLI.start
@@ -0,0 +1,30 @@
1
+ require "warc/version"
2
+
3
+ # Utils
4
+ require "warc/utils/header_hash"
5
+
6
+ # Updated NET library (will be included in 2.0)
7
+ require "warc/ext/net_http"
8
+
9
+ # http tools
10
+ require "warc/http"
11
+
12
+ # Stream
13
+ require "warc/stream"
14
+ require "warc/stream/plain"
15
+ require "warc/stream/gzip"
16
+
17
+ # Record
18
+ require "warc/record"
19
+ require "warc/record/validator"
20
+ require "warc/record/header"
21
+
22
+ # Parser
23
+ require "warc/parser"
24
+
25
+ # Exception
26
+ require "warc/exceptions"
27
+
28
+ module Warc
29
+
30
+ end
@@ -0,0 +1,23 @@
1
+ require 'warc'
2
+ require 'warc/proxy'
3
+ require 'thor'
4
+
5
module Warc
  # Thor-based command-line interface exposing the +dump+ and +replay+ commands.
  class CLI < Thor
    desc "dump WARC_FILE", "Dump record headers from WARC_FILE"
    method_option :offset, :default => 0, :type => :numeric
    # Prints a tab-separated header line followed by one tab-separated line
    # per record yielded by the stream opened on +path+.
    #
    # path - String path of the WARC file, handed to Warc.open_stream.
    #
    # The --offset option is passed straight to the stream's #each.
    def dump(path)
      stream = Warc.open_stream(path)
      puts "WARC filename\toffset\twarc-type\twarc-target-uri\twarc-record-id\tcontent-type\tcontent-length"
      stream.each(options[:offset]) do |record|
        header = record.header
        fields = [
          path,
          record.offset,
          header['warc-type'],
          header['warc-target-uri'],
          header['warc-record-id'],
          header['content-type'],
          header.content_length
        ]
        # to_s on each field reproduces what string interpolation would print
        # (a missing header becomes an empty column).
        puts fields.map { |field| field.to_s }.join("\t")
      end
    end

    desc "replay WARC_FILE", "Start a HTTP proxy serving request from WARC_FILE. Dashboard available at http://warc/"
    # The option used to be declared as :p while the command read
    # options[:port], so the chosen port never reached the server.
    # It is now named :port and keeps -p as an alias.
    method_option :port, :aliases => "-p", :default => 9292, :type => :numeric, :banner => "port"
    # Starts the replay proxy for the WARC file +warc+ on the selected port.
    def replay(warc)
      Warc::Proxy::Replay.start(warc, options[:port])
    end
  end
end
File without changes
@@ -0,0 +1,50 @@
1
+ require 'net/http'
2
+ require 'net/https'
3
+ require 'stringio'
4
+
5
module Net
  # Reopens Net::BufferedIO so it can wrap an in-memory string or a file on
  # disk as well as a real socket.
  class BufferedIO
    # io           - a Socket, OpenSSL::SSL::SSLSocket, StringIO or IO, used
    #                as-is; or a String, treated as a path when it names an
    #                existing regular file and as raw content otherwise.
    # debug_output - optional sink stored in @debug_output.
    #
    # Raises RuntimeError when +io+ is none of the supported types.
    #
    # NOTE(review): this replaces the stock initializer and only sets
    # @read_timeout, @rbuf, @debug_output and @io; confirm that is enough for
    # the net/http version in use.
    def initialize(io, debug_output = nil)
      @read_timeout = 60
      @rbuf = ''
      @debug_output = debug_output

      @io = case io
            when Socket, OpenSSL::SSL::SSLSocket, StringIO, IO
              io
            when String
              # A NUL byte can never be part of a path, so such a string is
              # always treated as content. File.exist? replaces
              # File.exists?, which newer Rubies no longer provide.
              if !io.include?("\0") && File.exist?(io) && !File.directory?(io)
                File.open(io, "r")
              else
                StringIO.new(io)
              end
            end
      raise "Unable to create fake socket from #{io}" unless @io
    end
  end

  class HTTP
    class << self
      # Returns the stub socket class defined at the bottom of this file.
      # The previous FakeWeb::StubSocket constant is not defined here.
      def socket_type_with_warc
        ::Warc::StubSocket
      end
      alias_method :socket_type, :socket_type_with_warc
    end
  end
end

module Warc
  # Inert socket stand-in: accepts any constructor arguments, always reports
  # itself closed and returns nil from #readuntil.
  class StubSocket #:nodoc:
    def initialize(*args)
    end

    def closed?
      @closed ||= true
    end

    def readuntil(*args)
    end
  end
end
@@ -0,0 +1,35 @@
1
module Warc
  # Helpers that fetch a URL over HTTP and turn the response into a WARC
  # "response" record.
  class HTTP
    # Fetches +uri+ with Net::HTTP and wraps the response in a Warc::Record.
    #
    # uri - String or URI of the resource to fetch.
    #
    # Returns a two-element Array: the new record and the Net::HTTP response.
    #
    # NOTE(review): the record date is taken from the response's Date header;
    # a response without one still fails on the nil lookup, as before.
    def self.get(uri)
      url = URI(uri)
      net_res = Net::HTTP.get_response(url)

      record = ::Warc::Record.new
      record.header["WARC-Type"] = "response"
      record.header.date = net_res.to_hash["date"][0]
      record.header["WARC-Target-URI"] = url.to_s
      record.header["Content-Type"] = "application/http;msgtype=response"

      headers = String.new
      headers << "HTTP/#{net_res.http_version} #{net_res.code} #{net_res.message}\r\n"
      # to_hash maps each header name to an Array of values; write one line
      # per value so repeated headers (e.g. Set-Cookie) are all archived.
      net_res.to_hash.each do |key, values|
        values.each { |value| headers << "#{key}: #{value}\r\n" }
      end

      record.content = "#{headers}\r\n#{net_res.body}"
      return record, net_res
    end

    # Fetches +uri+ and writes the resulting record to +stream+.
    #
    # uri    - String or URI of the resource to fetch.
    # stream - a Warc::Stream, or a String passed to Warc::Stream::Gzip.new.
    #
    # Returns the Net::HTTP response.
    # Raises ArgumentError, before any request is made, when +stream+ is
    # neither of the supported types.
    def self.archive(uri, stream)
      stream = case stream
               when ::Warc::Stream
                 stream
               when String
                 ::Warc::Stream::Gzip.new(stream)
               else
                 raise ArgumentError, "stream must be a Warc::Stream or a String, got #{stream.class}"
               end

      record, response = get(uri)
      stream.write_record(record)
      response
    end
  end
end
@@ -0,0 +1,24 @@
1
module Warc
  # Reads a single WARC record from a stream.
  class Parser
    # Skips forward to the next "WARC/1.0" version line, then reads the
    # header fields and the content block that follow it.
    #
    # stream - object responding to #tell, #readline and #read.
    #
    # Returns a Warc::Record whose #offset is the stream position (as
    # reported by stream.tell) of the version line.
    # An error raised by stream.readline (e.g. at end of input) propagates
    # unchanged.
    def parse(stream)
      # Remember where the matching version line starts; declared outside the
      # loop so the value survives the block.
      offset = nil
      loop do
        offset = stream.tell
        break if stream.readline.chomp("\r\n") == "WARC/1.0"
      end

      rec = Warc::Record.new
      rec.offset = offset

      # Header fields end at the first line that is not "name: value".
      # The name is matched non-greedily so a value containing ": "
      # (URLs, dates, ...) is kept whole instead of being split at the last
      # separator.
      while (m = /^(.*?): (.*)/.match(stream.readline))
        rec.header[m[1]] = m[2].chomp("\r")
      end

      rec.content = stream.read(rec.header.content_length)
      rec
    end
  end
end
@@ -0,0 +1 @@
1
+ require 'warc/proxy/proxy'
@@ -0,0 +1,45 @@
1
+ body {
2
+ font-family: Helvetica;
3
+ }
4
+
5
+ h1 {
6
+ font-size: 40px;
7
+ }
8
+
9
+ h2 {
10
+ font-size:20px;
11
+ margin:10px;
12
+ margin-top:25px;
13
+ }
14
+
15
+ ul {
16
+ list-style: none;
17
+ font-size:12px;
18
+ }
19
+
20
+ #wrapper {
21
+ width:600px;
22
+ margin:0px auto;
23
+ }
24
+
25
+ #editor {
26
+
27
+ }
28
+
29
+ #header {
30
+ background-color: black;
31
+ color: white;
32
+ height: 300px;
33
+ border-radius:200px;
34
+ -moz-border-radius:200px;
35
+ -webkit-border-radius: 200px;
36
+ width: 300px;
37
+ padding:25px;
38
+ margin:0 auto;
39
+ text-align:center;
40
+ }
41
+
42
+ iframe {
43
+ width:100%;
44
+ height:400px;
45
+ }
@@ -0,0 +1,85 @@
1
+ require 'rack'
2
+ require 'sinatra/base'
3
+
4
module Warc
  module Proxy
    # HTTP proxy that replays archived "response" records from a WARC file.
    # Requests whose Host is "warc" go to the Sinatra dashboard; every other
    # request is answered from the archive.
    class Replay < Sinatra::Base
      set :public_dir, File.expand_path('..', __FILE__) # set up the static dir (with images/js/css inside)
      set :views, File.expand_path('../views', __FILE__) # set up the views dir

      disable :protection

      # Per-client timestamps recorded by the /record route, keyed by IP.
      # Replaces a class variable that was written without being initialised.
      def self.sessions
        @sessions ||= {}
      end

      before do
        # Header name was misspelled ("Access-ConDrol-..."), so it had no effect.
        headers["Access-Control-Allow-Origin"] = "*"
      end

      get "/" do
        @size = @index.size
        erb :index
      end

      # NOTE(review): the splat is treated as a key of the index (a target
      # URI); the previous body called a non-existent Hash#record. Confirm
      # this is the intended identifier.
      get "/record/*" do
        id = params[:splat].join('/')
        halt 404, "not found" unless @index.key?(id)

        record = @warc.record(@index[id])
        self.class.sessions[request.ip] = Time.parse(record.header["WARC-Date"])
        redirect to(record.header["WARC-Target-URI"])
      end

      # app  - downstream Rack app handed to Sinatra::Base (may be nil).
      # warc - argument for Warc.open_stream identifying the archive.
      #
      # Scans the whole stream once and maps each response record's
      # header.uri to its offset.
      def initialize(app=nil,warc=nil)
        super(app)
        @warc = ::Warc.open_stream(warc)
        @index = {}
        puts "Building index"
        @warc.each do |record|
          next unless record.header["warc-type"] == "response"
          @index[record.header.uri] = record.offset
        end
        puts "Indexing done"
      end

      # Rack entry point: dashboard for Host "warc", archive otherwise.
      def call(env)
        if env["HTTP_HOST"] == "warc"
          super(env)
        else
          serve(env)
        end
      end

      # Looks up the requested URL in the index and replays the stored
      # response, or returns a 404 Rack triplet when it is not archived.
      def serve(env)
        uri = "http://#{env['HTTP_HOST']}#{env['REQUEST_URI']}"
        return [404,{"Content-Type" => "text/html"},["not found"]] unless @index.key?(uri)

        http_response(@warc.record(@index[uri]))
      end

      # Converts an archived HTTP response (status line, headers, blank line,
      # body) into a Rack triplet.
      #
      # Returns [Integer status, Hash headers, StringIO body]. The body is
      # positioned just past the header section. A record whose first line is
      # not an HTTP status line yields a 500 response.
      def http_response(record)
        io = StringIO.new(record.content)
        status = /^HTTP\/(\d\.\d) (\d+) (.*)/.match(io.gets.to_s)
        unless status
          return [500,{"Content-Type" => "text/plain"},["malformed archived response"]]
        end

        headers = {}
        # gets (not readline) so a record without the blank separator line
        # ends the loop instead of raising EOFError. The name is matched
        # non-greedily so values containing ": " stay intact.
        while (line = io.gets) && (m = /^(.*?): (.*)/.match(line))
          name, value = m[1], m[2].chomp("\r")
          # Repeated headers are newline-joined rather than overwritten.
          headers[name] = headers.key?(name) ? "#{headers[name]}\n#{value}" : value
        end

        # Rack expects an Integer status, not the matched String.
        [status[2].to_i,headers,io]
      end

      # Builds the Rack app for +warc+ and serves it with thin on +port+.
      def self.start(warc,port)
        app = Rack::Builder.new {
          run Warc::Proxy::Replay.new(nil,warc)
        }
        puts "Starting proxy server on port #{port}"
        ::Rack::Server.start(:app => app,:Port => port,:server=>:thin)
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ <div id="wrapper">
2
+ <div id="header">
3
+ <h1>WARC</h1>
4
+ <h2>Web ARChive</h2>
5
+ <p>Currently serving <b><%= @size %> records</b></p>
6
+ </div>
7
+
8
+ <div>
9
+ <h2>Records:</h2>
10
+ <ul>
11
+ <% @index.collect do |uri,offset| %>
12
+ <li><a href="<%=uri%>"><%=uri%></a></li>
13
+ <% end %>
14
+ </ul>
15
+ </div>
16
+ </div>
@@ -0,0 +1,9 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <link href="/css/main.css" rel="stylesheet" type="text/css">
5
+ </head>
6
+ <body>
7
+ <%= yield %>
8
+ </body>
9
+ </html>
@@ -0,0 +1,59 @@
1
module Warc
  # A single WARC record: a header plus an opaque content block.
  class Record
    # Version line written at the start of every dumped record.
    VERSION = "WARC/1.0".freeze

    attr_accessor :content, :offset
    attr_reader :header

    # h       - a Hash of header fields, or a WEBrick::HTTPResponse to archive
    #           (only recognised when WEBrick is loaded).
    # content - the record block; ignored for a WEBrick response, where the
    #           block is rebuilt from the response itself.
    #
    # Raises ArgumentError for any other +h+; previously the header was
    # silently left nil.
    def initialize(h={},content=nil)
      @content = content
      if h.is_a?(Hash)
        @header = Header.new(self,h)
      elsif defined?(::WEBrick::HTTPResponse) && h.is_a?(::WEBrick::HTTPResponse)
        # Checked with defined? so the constant is never dereferenced when
        # WEBrick has not been required.
        @header = Header.new(self)
        @header["WARC-Type"] = "response"
        @header["WARC-Target-URI"] = h.request_uri.to_s
        @header["Content-Type"] = "application/http;msgtype=response"
        #@header["WARC-IP-Address"]
        crlf = "\r\n"
        body = String.new
        body << h.status_line
        h.header.each do |k,v|
          body << "#{k}: #{v}" + crlf
        end
        body << crlf + h.body
        self.content = body
        self.header.block_digest
        @header["WARC-Payload-Digest"] = self.header.compute_digest(h.body)
      else
        raise ArgumentError, "cannot build a Warc::Record from #{h.class}"
      end
    end

    # Parses the content as an HTTP response when the record's Content-Type is
    # "application/http;msgtype=response".
    #
    # Returns a Net::HTTPResponse with its body read, or nil for any other
    # content type.
    def to_http
      return unless @header["Content-Type"] == "application/http;msgtype=response"

      socket = Net::BufferedIO.new(content)
      response = Net::HTTPResponse.read_new(socket)
      response.reading_body(socket,true) {}
      response
    end

    # Serialises the record to +out+ (anything responding to #write).
    #
    #   warc-file   = 1*warc-record
    #   warc-record = header CRLF
    #                 block CRLF CRLF
    #   header      = version CRLF
    #                 warc-fields
    #   version     = "WARC/1.0" CRLF
    #   warc-fields = *named-field CRLF
    #   block       = *OCTET
    def dump_to(out)
      crlf = "\r\n"

      out.write(VERSION + crlf)
      out.write(self.header.to_s)
      out.write(crlf)
      # content defaults to nil; to_s writes an empty block instead of
      # failing on nil + String.
      out.write(self.content.to_s + crlf*2)
    end
  end
end