warc 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +20 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/Rakefile +2 -0
- data/bin/warc +4 -0
- data/lib/warc.rb +30 -0
- data/lib/warc/cli.rb +23 -0
- data/lib/warc/exceptions.rb +0 -0
- data/lib/warc/ext/net_http.rb +50 -0
- data/lib/warc/http.rb +35 -0
- data/lib/warc/parser.rb +24 -0
- data/lib/warc/proxy.rb +1 -0
- data/lib/warc/proxy/css/main.css +45 -0
- data/lib/warc/proxy/proxy.rb +85 -0
- data/lib/warc/proxy/views/index.erb +16 -0
- data/lib/warc/proxy/views/layout.erb +9 -0
- data/lib/warc/record.rb +59 -0
- data/lib/warc/record/header.rb +88 -0
- data/lib/warc/record/validator.rb +13 -0
- data/lib/warc/stream.rb +96 -0
- data/lib/warc/stream/gzip.rb +35 -0
- data/lib/warc/stream/plain.rb +23 -0
- data/lib/warc/utils/header_hash.rb +63 -0
- data/lib/warc/version.rb +3 -0
- data/spec/fixtures/arg.warc +267 -0
- data/spec/fixtures/criterion.warc +643 -0
- data/spec/fixtures/criterion.warc.gz +0 -0
- data/spec/fixtures/frg.warc +3617 -4
- data/spec/fixtures/frg.warc.gz +0 -0
- data/spec/fixtures/http_imdb +954 -0
- data/spec/spec_helper.rb +27 -0
- data/spec/warc/http_spec.rb +9 -0
- data/spec/warc/record/header_spec.rb +37 -0
- data/spec/warc/record_spec.rb +20 -0
- data/spec/warc/stream/gzip_spec.rb +46 -0
- data/spec/warc/stream/plain_spec.rb +41 -0
- data/spec/warc/stream_spec.rb +55 -0
- data/warc.gemspec +27 -0
- metadata +195 -0
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'warc'
|
2
|
+
|
3
|
+
# Builds the absolute path of a fixture file.
#
# +path+ is interpreted relative to the +fixtures+ directory that sits next
# to this helper file.
def fixture(path)
  helper_dir = File.dirname(__FILE__)
  joined = File.join(helper_dir, 'fixtures', path)
  File.expand_path(joined)
end
|
6
|
+
|
7
|
+
# Returns the contents of the fixture file named +path+, as located by
# +fixture+.
def read_fixture(path)
  fixture_path = fixture(path)
  File.read(fixture_path)
end
|
10
|
+
|
11
|
+
# Maps a URL to the name of the fixture file used as its stubbed response.
# Frozen so the shared table cannot be mutated by accident from a spec.
SAMPLES = {
  "http://www.imdb.com/" => "http_imdb"
}.freeze
|
14
|
+
|
15
|
+
# Unless LIVE_TEST is set in the environment, register every URL in SAMPLES
# with FakeWeb (using its fixture as the response) and disallow real network
# connections. If FakeWeb cannot be loaded, print a notice and continue
# without stubbing.
unless ENV['LIVE_TEST']
  begin
    require 'fakeweb'

    FakeWeb.allow_net_connect = false
    SAMPLES.each_pair do |uri, fixture_name|
      canned_response = read_fixture(fixture_name)
      FakeWeb.register_uri(:get, uri, :response => canned_response)
    end
  rescue LoadError
    puts "Could not load FakeWeb, these tests will hit IMDB.com",
         "You can run `gem install fakeweb` to stub out the responses."
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for the header object exposed by a Warc::Record that was built from
# a hash of header fields.
describe Warc::Record::Header do
  context "when instansiated with some value" do
    before(:each) do
      fields = {
        "WARC-Type"      => "response",
        "WARC-Record-ID" => "<record-1>",
        "WARC-Date"      => "2000-01-02T03:04:05Z",
        "Content-Length" => "10"
      }
      record = Warc::Record.new(fields)
      @header = record.header
    end

    subject { @header }

    its(:type) { should eq("response") }

    it "should have attributes for mandatory fields" do
      @header.type.should eq("response")
      @header.record_id.should eq("<record-1>")
      @header.date.should eq("2000-01-02T03:04:05Z")
      # Content-Length was supplied as the string "10"; the accessor is
      # expected to hand back an integer.
      @header.content_length.should eq(10)
    end

    it "should be case-insensitive to field names" do
      %w[WARC-Type warc-type WARC-TYPE].each do |field_name|
        @header[field_name].should eq("response")
      end
    end

    context "when warc-content-length empty" do
      # Remove the field using a lower-cased key before each example.
      before { @header.delete("content-length") }

      it "should assume a content-lenght of 0" do
        @header.content_length.should eq(0)
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for Warc::Record: it exposes a header, and that header reports a
# content length matching the assigned content.
describe Warc::Record do
  before(:each) do
    header_fields = {
      "WARC-Type" => "response",
      "WARC-Date" => "2000-01-02T03:04:05Z"
    }

    @rec = Warc::Record.new
    @rec.content = "asdf"
    @rec.header.replace(header_fields)
  end

  it "should have a header" do
    responds = @rec.respond_to?(:header)
    responds.should eq(true)
  end

  it "should compute content-length" do
    # "asdf" is four characters long.
    computed_length = @rec.header.content_length
    computed_length.should eq(4)
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for reading gzip-compressed WARC fixtures through Warc.open_stream.
describe Warc::Stream::Gzip do
  context "compressed file single entry" do
    let(:warc)   { Warc.open_stream(fixture('criterion.warc.gz')) }
    let(:record) { warc.first }

    it "should read the content" do
      declared_length = record.header.content_length
      record.content.length.should eq(declared_length)
    end

    it "should parse all headers" do
      record.header.length.should eq(7)
    end

    it "should return the headers as an hash" do
      expected_header = {
        "WARC-Type"         => "warcinfo",
        "Content-Type"      => "application/warc-fields",
        "WARC-Date"         => "2012-09-13T22:52:52Z",
        "WARC-Record-ID"    => "<urn:uuid:671787C3-3C00-4256-8C5C-386A4D8F7468>",
        "WARC-Filename"     => "criterion.warc.gz",
        "WARC-Block-Digest" => "sha1:OX3R5RVY4LFQ6WIPTDCLTY3ABKWLXUBU",
        "Content-Length"    => "234"
      }
      record.header.should eq(expected_header)
    end
  end

  context "compressed file mutliple entries" do
    let(:warc) { Warc.open_stream(fixture('frg.warc.gz')) }

    it "should find all record" do
      warc.count.should eq(56)
    end
  end

  # Placeholder: no examples for writing records have been written yet.
  context "writing record" do
    before(:each) do
    end
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
|
3
|
+
# Specs for reading uncompressed WARC fixtures through Warc.open_stream.
describe Warc::Stream::Plain do
  context "single entry of an uncompressed file" do
    let(:warc)   { Warc.open_stream(fixture('criterion.warc')) }
    let(:record) { warc.first }

    it "should parse all headers" do
      record.header.length.should eq(7)
    end

    it "should return the headers as an hash" do
      expected_header = {
        "WARC-Type"         => "warcinfo",
        "Content-Type"      => "application/warc-fields",
        "WARC-Date"         => "2012-09-13T22:53:20Z",
        "WARC-Record-ID"    => "<urn:uuid:CF5083F4-FEB1-4C63-B3CB-B450AB609875>",
        "WARC-Filename"     => "criterion.warc",
        "WARC-Block-Digest" => "sha1:XGSXIX3L7RQGXFU6XJ32NCHXCKN6BBMK",
        "Content-Length"    => "258"
      }
      record.header.should eq(expected_header)
    end

    it "should read the content" do
      declared_length = record.header.content_length
      record.content.length.should eq(declared_length)
    end
  end

  context "multiples entries" do
    let(:warc) { Warc.open_stream(fixture('frg.warc')) }

    it "should find all record" do
      warc.count.should eq(56)
    end
  end
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'spec_helper.rb'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
# Specs for writing records through the Plain and Gzip streams and for
# locating records in an existing WARC fixture.
describe Warc::Stream do
  # A small response record shared by the write examples below.
  before(:each) do
    @rec = Warc::Record.new
    @rec.content = "asdf asdf"
    @rec.header.replace({
      "WARC-Type" => "response",
      "WARC-Date" => "2000-01-02T03:04:05Z",
    })
  end

  it "can't be initialized" do
    # NOTE(review): this example has no assertion and therefore always
    # passes; the intended check is still commented out.
    #s = Warc::Stream.new
  end

  it "should save to multiple files" do
    s = Warc::Stream::Plain.new('/tmp/test', :max_filesize => 10*10**6)
    begin
      # Write 100 records of 10**6 bytes each against a 10 * 10**6 byte
      # per-file limit, so the output has to spill over several files.
      100.times do
        r = Warc::Record.new
        r.content = "0" * (10**6)
        r.header.replace({"WARC-Type"=> "response","WARC-Date" => "2000-01-02T03:04:05Z"})
        s.write_record(r)
      end
      # File.exists? was deprecated and later removed; File.exist? is the
      # supported spelling.
      ::File.exist?('/tmp/test.000010.warc').should eq(true)
    ensure
      # Release the stream and remove the generated files even when the
      # expectation above fails.
      s.close
      FileUtils.rm_f Dir.glob('/tmp/test.*.warc')
    end
  end

  it "should dump record to file" do
    s = Warc::Stream::Plain.new('/tmp/test.plain')
    s.write_record(@rec)
    s.close
  end

  it "should dump gzipped to file" do
    s = Warc::Stream::Gzip.new('/tmp/test.gzip')
    s.write_record(@rec)
    s.close
  end

  it "should find record" do
    stream = ::Warc.open_stream(fixture('arg.warc'))
    uri = "http://antoineroygobeil.com/"
    # NOTE(review): the result is never asserted on, so this example only
    # verifies that iterating the stream does not raise.
    record = stream.detect do |rec|
      rec.header["warc-target-uri"] == uri && rec.header["warc-type"] == "response"
    end
  end

  it "should read record at given offset" do
    stream = ::Warc.open_stream(fixture('arg.warc'))
    stream.record(8287).header.record_id.should eq("<urn:uuid:5D799C11-D46C-4AC8-B598-5DC9F4205C6E>")
  end
end
|
data/warc.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/warc/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for the warc gem: package identity, the files shipped
# (taken from git), and its runtime dependencies.
Gem::Specification.new do |gem|
  gem.authors       = ["antoine"]
  gem.email         = ["roygobeil.antoine@gmail.com"]
  #gem.description   = %q{TODO: Write a gem description}
  gem.summary       = %q{warc is a pure ruby implementation of Web ARChive file reader and writer}
  gem.homepage      = ""

  gem.license       = "MIT"

  # `git ls-files` prints one path per line. Split on newlines explicitly:
  # $\ (the output record separator) is nil by default, which makes
  # String#split break on any whitespace and mangles paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "warc"
  gem.require_paths = ["lib"]
  gem.version       = Warc::VERSION

  # Runtime dependencies (no version constraints are declared).
  gem.add_dependency("uuid")
  gem.add_dependency("activemodel")
  gem.add_dependency("rack")
  gem.add_dependency("thor")
  gem.add_dependency("rack-contrib")
  gem.add_dependency("sinatra")
  gem.add_dependency("thin")
end
|
metadata
ADDED
@@ -0,0 +1,195 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: warc
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- antoine
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-24 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: uuid
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: activemodel
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rack
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ! '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: thor
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ! '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rack-contrib
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: sinatra
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ! '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: thin
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description:
|
112
|
+
email:
|
113
|
+
- roygobeil.antoine@gmail.com
|
114
|
+
executables:
|
115
|
+
- warc
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- .gitignore
|
120
|
+
- Gemfile
|
121
|
+
- LICENSE
|
122
|
+
- README.md
|
123
|
+
- Rakefile
|
124
|
+
- bin/warc
|
125
|
+
- lib/warc.rb
|
126
|
+
- lib/warc/cli.rb
|
127
|
+
- lib/warc/exceptions.rb
|
128
|
+
- lib/warc/ext/net_http.rb
|
129
|
+
- lib/warc/http.rb
|
130
|
+
- lib/warc/parser.rb
|
131
|
+
- lib/warc/proxy.rb
|
132
|
+
- lib/warc/proxy/css/main.css
|
133
|
+
- lib/warc/proxy/proxy.rb
|
134
|
+
- lib/warc/proxy/views/index.erb
|
135
|
+
- lib/warc/proxy/views/layout.erb
|
136
|
+
- lib/warc/record.rb
|
137
|
+
- lib/warc/record/header.rb
|
138
|
+
- lib/warc/record/validator.rb
|
139
|
+
- lib/warc/stream.rb
|
140
|
+
- lib/warc/stream/gzip.rb
|
141
|
+
- lib/warc/stream/plain.rb
|
142
|
+
- lib/warc/utils/header_hash.rb
|
143
|
+
- lib/warc/version.rb
|
144
|
+
- spec/fixtures/arg.warc
|
145
|
+
- spec/fixtures/criterion.warc
|
146
|
+
- spec/fixtures/criterion.warc.gz
|
147
|
+
- spec/fixtures/frg.warc
|
148
|
+
- spec/fixtures/frg.warc.gz
|
149
|
+
- spec/fixtures/http_imdb
|
150
|
+
- spec/spec_helper.rb
|
151
|
+
- spec/warc/http_spec.rb
|
152
|
+
- spec/warc/record/header_spec.rb
|
153
|
+
- spec/warc/record_spec.rb
|
154
|
+
- spec/warc/stream/gzip_spec.rb
|
155
|
+
- spec/warc/stream/plain_spec.rb
|
156
|
+
- spec/warc/stream_spec.rb
|
157
|
+
- warc.gemspec
|
158
|
+
homepage: ''
|
159
|
+
licenses:
|
160
|
+
- MIT
|
161
|
+
metadata: {}
|
162
|
+
post_install_message:
|
163
|
+
rdoc_options: []
|
164
|
+
require_paths:
|
165
|
+
- lib
|
166
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
167
|
+
requirements:
|
168
|
+
- - ! '>='
|
169
|
+
- !ruby/object:Gem::Version
|
170
|
+
version: '0'
|
171
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
172
|
+
requirements:
|
173
|
+
- - ! '>='
|
174
|
+
- !ruby/object:Gem::Version
|
175
|
+
version: '0'
|
176
|
+
requirements: []
|
177
|
+
rubyforge_project:
|
178
|
+
rubygems_version: 2.1.9
|
179
|
+
signing_key:
|
180
|
+
specification_version: 4
|
181
|
+
summary: warc is a pure ruby implementation of Web ARChive file reader and writer
|
182
|
+
test_files:
|
183
|
+
- spec/fixtures/arg.warc
|
184
|
+
- spec/fixtures/criterion.warc
|
185
|
+
- spec/fixtures/criterion.warc.gz
|
186
|
+
- spec/fixtures/frg.warc
|
187
|
+
- spec/fixtures/frg.warc.gz
|
188
|
+
- spec/fixtures/http_imdb
|
189
|
+
- spec/spec_helper.rb
|
190
|
+
- spec/warc/http_spec.rb
|
191
|
+
- spec/warc/record/header_spec.rb
|
192
|
+
- spec/warc/record_spec.rb
|
193
|
+
- spec/warc/stream/gzip_spec.rb
|
194
|
+
- spec/warc/stream/plain_spec.rb
|
195
|
+
- spec/warc/stream_spec.rb
|