warc 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ require 'warc'
2
+
3
# Resolves a fixture file name to an absolute path inside the +fixtures+
# directory located next to this helper file.
#
# @param path [String] file name relative to the fixtures directory
# @return [String] absolute path of the fixture file
def fixture(path)
  fixtures_dir = File.join(File.dirname(__FILE__), 'fixtures')
  File.expand_path(File.join(fixtures_dir, path))
end
6
+
7
# Reads the entire contents of a fixture file.
#
# @param path [String] file name relative to the fixtures directory
# @return [String] contents of the fixture file
def read_fixture(path)
  fixture_file = fixture(path)
  File.read(fixture_file)
end
10
+
11
# Maps each stubbed URL to the name of the fixture file that holds its canned
# response. Frozen so the shared table cannot be mutated by accident.
SAMPLES = {
  "http://www.imdb.com/" => "http_imdb"
}.freeze
14
+
15
# Unless LIVE_TEST is set in the environment, block real network access and
# serve every SAMPLES URL from its fixture file through FakeWeb. When FakeWeb
# is not installed, only a notice is printed.
if ENV['LIVE_TEST'].nil?
  begin
    require 'fakeweb'

    FakeWeb.allow_net_connect = false
    SAMPLES.each_pair do |url, fixture_name|
      canned_response = read_fixture(fixture_name)
      FakeWeb.register_uri(:get, url, :response => canned_response)
    end
  rescue LoadError
    puts "Could not load FakeWeb, these tests will hit IMDB.com"
    puts "You can run `gem install fakeweb` to stub out the responses."
  end
end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper.rb'
2
+
3
# Specs for turning a WARC record into a Net::HTTP response object.
describe Warc::HTTP do
  it "should be able to return a HTTPResponse object from a warc record" do
    # The record at index 2 of the criterion fixture is the one converted here.
    stream = ::Warc.open_stream(fixture('criterion.warc.gz'))
    third_record = stream.to_a[2]
    response = third_record.to_http

    response.class.should eq(Net::HTTPOK)
  end
end
@@ -0,0 +1,37 @@
1
+ require 'spec_helper.rb'
2
+
3
# Specs for the header object exposed by a Warc::Record.
describe Warc::Record::Header do
  context "when instansiated with some value" do
    # Build a fresh header for every example from a record created with the
    # four fields that the examples below assert on.
    before(:each) do
      fields = {
        "WARC-Type"      => "response",
        "WARC-Record-ID" => "<record-1>",
        "WARC-Date"      => "2000-01-02T03:04:05Z",
        "Content-Length" => "10"
      }
      @header = Warc::Record.new(fields).header
    end

    subject { @header }

    its(:type) { should eq("response") }

    it "should have attributes for mandatory fields" do
      @header.type.should eq("response")
      @header.record_id.should eq("<record-1>")
      @header.date.should eq("2000-01-02T03:04:05Z")
      # Content-Length is supplied as the string "10" and expected back as an integer.
      @header.content_length.should eq(10)
    end

    it "should be case-insensitive to field names" do
      %w[WARC-Type warc-type WARC-TYPE].each do |field_name|
        @header[field_name].should eq("response")
      end
    end

    context "when warc-content-length empty" do
      before { @header.delete("content-length") }

      it "should assume a content-lenght of 0" do
        @header.content_length.should eq(0)
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'spec_helper.rb'
2
+
3
# Specs for the basic Warc::Record interface.
describe Warc::Record do
  # Every example starts from a record with a four-character body and a header
  # that carries only a type and a date.
  before(:each) do
    header_fields = {
      "WARC-Type" => "response",
      "WARC-Date" => "2000-01-02T03:04:05Z"
    }
    @rec = Warc::Record.new
    @rec.content = "asdf"
    @rec.header.replace(header_fields)
  end

  it "should have a header" do
    has_header = @rec.respond_to?(:header)
    has_header.should eq(true)
  end

  it "should compute content-length" do
    # No Content-Length field is set above; 4 matches the size of "asdf".
    @rec.header.content_length.should eq(4)
  end
end
@@ -0,0 +1,46 @@
1
+ require 'spec_helper.rb'
2
+
3
# Specs for reading gzip-compressed WARC fixtures.
describe Warc::Stream::Gzip do
  context "compressed file single entry" do
    # Open the compressed fixture and keep its first record for each example.
    before(:each) do
      @warc = Warc.open_stream(fixture('criterion.warc.gz'))
      @record = @warc.first
    end

    it "should read the content" do
      actual_length = @record.content.length
      actual_length.should eq(@record.header.content_length)
    end

    it "should parse all headers" do
      header = @record.header
      header.length.should eq(7)
    end

    it "should return the headers as an hash" do
      expected_fields = {
        "WARC-Type"         => "warcinfo",
        "Content-Type"      => "application/warc-fields",
        "WARC-Date"         => "2012-09-13T22:52:52Z",
        "WARC-Record-ID"    => "<urn:uuid:671787C3-3C00-4256-8C5C-386A4D8F7468>",
        "WARC-Filename"     => "criterion.warc.gz",
        "WARC-Block-Digest" => "sha1:OX3R5RVY4LFQ6WIPTDCLTY3ABKWLXUBU",
        "Content-Length"    => "234"
      }
      @record.header.should eq(expected_fields)
    end
  end

  context "compressed file mutliple entries" do
    before(:each) do
      @warc = Warc.open_stream(fixture('frg.warc.gz'))
    end

    it "should find all record" do
      record_total = @warc.count
      record_total.should eq(56)
    end
  end

  # Placeholder context: it defines no examples yet.
  context "writing record" do
    before(:each) do
    end
  end
end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper.rb'
2
+
3
# Specs for reading uncompressed WARC fixtures.
describe Warc::Stream::Plain do
  context "single entry of an uncompressed file" do
    # Open the plain fixture and keep its first record for each example.
    before(:each) do
      @warc = Warc.open_stream(fixture('criterion.warc'))
      @record = @warc.first
    end

    it "should parse all headers" do
      header = @record.header
      header.length.should eq(7)
    end

    it "should return the headers as an hash" do
      expected_fields = {
        "WARC-Type"         => "warcinfo",
        "Content-Type"      => "application/warc-fields",
        "WARC-Date"         => "2012-09-13T22:53:20Z",
        "WARC-Record-ID"    => "<urn:uuid:CF5083F4-FEB1-4C63-B3CB-B450AB609875>",
        "WARC-Filename"     => "criterion.warc",
        "WARC-Block-Digest" => "sha1:XGSXIX3L7RQGXFU6XJ32NCHXCKN6BBMK",
        "Content-Length"    => "258"
      }
      @record.header.should eq(expected_fields)
    end

    it "should read the content" do
      actual_length = @record.content.length
      actual_length.should eq(@record.header.content_length)
    end
  end

  context "multiples entries" do
    before(:each) do
      @warc = Warc.open_stream(fixture('frg.warc'))
    end

    it "should find all record" do
      record_total = @warc.count
      record_total.should eq(56)
    end
  end
end
@@ -0,0 +1,55 @@
1
+ require 'spec_helper.rb'
2
+ require 'fileutils'
3
+
4
# Specs for writing records through the stream classes and for looking up
# records in an existing archive.
describe Warc::Stream do
  # Every example starts from a small "response" record.
  before(:each) do
    @rec = Warc::Record.new
    @rec.content = "asdf asdf"
    @rec.header.replace({
      "WARC-Type" => "response",
      "WARC-Date" => "2000-01-02T03:04:05Z",
    })
  end

  it "can't be initialized" do
    # TODO: no expectation yet; the original attempt is kept for reference.
    #s = Warc::Stream.new
  end

  it "should save to multiple files" do
    s = Warc::Stream::Plain.new('/tmp/test', :max_filesize => 10 * 10**6)
    begin
      # 100 records of 10**6 bytes each, written against a 10 * 10**6 byte limit.
      100.times do
        r = Warc::Record.new
        r.content = "0" * (10**6)
        r.header.replace({"WARC-Type" => "response", "WARC-Date" => "2000-01-02T03:04:05Z"})
        s.write_record(r)
      end
      # Close the stream like the other write examples do, so it is not leaked.
      s.close
      # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
      ::File.exist?('/tmp/test.000010.warc').should eq(true)
    ensure
      # Remove the generated archives even when the expectation above fails.
      FileUtils.rm Dir.glob('/tmp/test.*.warc')
    end
  end

  it "should dump record to file" do
    s = Warc::Stream::Plain.new('/tmp/test.plain')
    s.write_record(@rec)
    s.close
  end

  it "should dump gzipped to file" do
    s = Warc::Stream::Gzip.new('/tmp/test.gzip')
    s.write_record(@rec)
    s.close
  end

  it "should find record" do
    stream = ::Warc.open_stream(fixture('arg.warc'))
    uri = "http://antoineroygobeil.com/"
    record = stream.detect do |rec|
      rec.header["warc-target-uri"] == uri && rec.header["warc-type"] == "response"
    end
    # Assert on the lookup so the example cannot pass without finding anything.
    # NOTE(review): assumes arg.warc holds a response record for this URI - confirm.
    record.should_not be_nil
  end

  it "should read record at given offset" do
    stream = ::Warc.open_stream(fixture('arg.warc'))
    stream.record(8287).header.record_id.should eq("<urn:uuid:5D799C11-D46C-4AC8-B598-5DC9F4205C6E>")
  end
end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/warc/version', __FILE__)
3
+
4
# Gem specification for warc.
Gem::Specification.new do |gem|
  gem.name          = "warc"
  gem.version       = Warc::VERSION
  gem.authors       = ["antoine"]
  gem.email         = ["roygobeil.antoine@gmail.com"]
  #gem.description = %q{TODO: Write a gem description}
  gem.summary       = %q{warc is a pure ruby implementation of Web ARChive file reader and writer}
  # TODO: the homepage is still blank; fill in the project URL.
  gem.homepage      = ""

  gem.license       = "MIT"

  # List tracked files NUL-separated so paths containing whitespace stay
  # intact. The previous split($\) split on any whitespace because $\ is nil
  # by default.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency("uuid")
  gem.add_dependency("activemodel")
  gem.add_dependency("rack")
  gem.add_dependency("thor")
  gem.add_dependency("rack-contrib")
  gem.add_dependency("sinatra")
  gem.add_dependency("thin")
end
metadata ADDED
@@ -0,0 +1,195 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: warc
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - antoine
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: uuid
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activemodel
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rack
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: thor
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ! '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rack-contrib
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: sinatra
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ! '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: thin
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description:
112
+ email:
113
+ - roygobeil.antoine@gmail.com
114
+ executables:
115
+ - warc
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - .gitignore
120
+ - Gemfile
121
+ - LICENSE
122
+ - README.md
123
+ - Rakefile
124
+ - bin/warc
125
+ - lib/warc.rb
126
+ - lib/warc/cli.rb
127
+ - lib/warc/exceptions.rb
128
+ - lib/warc/ext/net_http.rb
129
+ - lib/warc/http.rb
130
+ - lib/warc/parser.rb
131
+ - lib/warc/proxy.rb
132
+ - lib/warc/proxy/css/main.css
133
+ - lib/warc/proxy/proxy.rb
134
+ - lib/warc/proxy/views/index.erb
135
+ - lib/warc/proxy/views/layout.erb
136
+ - lib/warc/record.rb
137
+ - lib/warc/record/header.rb
138
+ - lib/warc/record/validator.rb
139
+ - lib/warc/stream.rb
140
+ - lib/warc/stream/gzip.rb
141
+ - lib/warc/stream/plain.rb
142
+ - lib/warc/utils/header_hash.rb
143
+ - lib/warc/version.rb
144
+ - spec/fixtures/arg.warc
145
+ - spec/fixtures/criterion.warc
146
+ - spec/fixtures/criterion.warc.gz
147
+ - spec/fixtures/frg.warc
148
+ - spec/fixtures/frg.warc.gz
149
+ - spec/fixtures/http_imdb
150
+ - spec/spec_helper.rb
151
+ - spec/warc/http_spec.rb
152
+ - spec/warc/record/header_spec.rb
153
+ - spec/warc/record_spec.rb
154
+ - spec/warc/stream/gzip_spec.rb
155
+ - spec/warc/stream/plain_spec.rb
156
+ - spec/warc/stream_spec.rb
157
+ - warc.gemspec
158
+ homepage: ''
159
+ licenses:
160
+ - MIT
161
+ metadata: {}
162
+ post_install_message:
163
+ rdoc_options: []
164
+ require_paths:
165
+ - lib
166
+ required_ruby_version: !ruby/object:Gem::Requirement
167
+ requirements:
168
+ - - ! '>='
169
+ - !ruby/object:Gem::Version
170
+ version: '0'
171
+ required_rubygems_version: !ruby/object:Gem::Requirement
172
+ requirements:
173
+ - - ! '>='
174
+ - !ruby/object:Gem::Version
175
+ version: '0'
176
+ requirements: []
177
+ rubyforge_project:
178
+ rubygems_version: 2.1.9
179
+ signing_key:
180
+ specification_version: 4
181
+ summary: warc is a pure ruby implementation of Web ARChive file reader and writer
182
+ test_files:
183
+ - spec/fixtures/arg.warc
184
+ - spec/fixtures/criterion.warc
185
+ - spec/fixtures/criterion.warc.gz
186
+ - spec/fixtures/frg.warc
187
+ - spec/fixtures/frg.warc.gz
188
+ - spec/fixtures/http_imdb
189
+ - spec/spec_helper.rb
190
+ - spec/warc/http_spec.rb
191
+ - spec/warc/record/header_spec.rb
192
+ - spec/warc/record_spec.rb
193
+ - spec/warc/stream/gzip_spec.rb
194
+ - spec/warc/stream/plain_spec.rb
195
+ - spec/warc/stream_spec.rb