web_dump 0.0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/LICENSE +20 -0
- data/README.rdoc +59 -0
- data/Rakefile +58 -0
- data/lib/web_dump/version.rb +16 -0
- data/lib/web_dump.rb +104 -0
- data/test/test_web_dump.rb +47 -0
- data/web_dump.gemspec +50 -0
- metadata +87 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Marcel Massana
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
= web_dump
|
2
|
+
|
3
|
+
Little tiny class to easily save and retrieve web pages
|
4
|
+
|
5
|
+
In web related client applications, such as spiders, it is frequently necessary
|
6
|
+
to save pages into files with an adequate naming convention. WebDump comes to the
|
7
|
+
rescue. It manages the details of assigning unique, readable names and saving files
|
8
|
+
named after URIs that have been visited. Additionally, saved data can be
|
9
|
+
conveniently compressed with gzip for deep web spidering: compression depends only
|
10
|
+
on giving the appropriate file extension ('.gz') when saving.
|
11
|
+
|
12
|
+
Conversely, file read operation is available through convenient methods
|
13
|
+
indicating either a pathname or a URI.
|
14
|
+
|
15
|
+
== Installation
|
16
|
+
|
17
|
+
$ sudo gem install web_dump
|
18
|
+
|
19
|
+
The main source repository is http://github.com/syborg/web_dump.
|
20
|
+
|
21
|
+
== Usage
|
22
|
+
|
23
|
+
First of all ...
|
24
|
+
|
25
|
+
require 'rubygems'
|
26
|
+
require 'web_dump'
|
27
|
+
|
28
|
+
Instantiate an object. You may add some options that can be passed through a
|
29
|
+
hash
|
30
|
+
|
31
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
32
|
+
|
33
|
+
`wd`, when asked to, will save all files inside expanded directory '~/mydir'
|
34
|
+
with an appended file extension at the end '.gz' (if not overridden later)
|
35
|
+
|
36
|
+
Other options could be passed when instantiating an object.
|
37
|
+
|
38
|
+
* `:file_ext => extension` (String that will be appended at the end to every filename if not changed from _save_ method)
|
39
|
+
|
40
|
+
Most of them are also passed along to an UriPathname object that is created.
|
41
|
+
|
42
|
+
* `:base_dir => dir_name` (directory where everything will be stored. Defaults to '~/web_dumps')
|
43
|
+
* `:pth_sep => psep` (String that will be used to substitute '/' inside URI's path and queries (defaults to UriPathname::PTH_SEP='_|_'))
|
44
|
+
* `:host_sep => hsep` (String that will be used to separate the URI's hostname and path when constructing the pathname. If '/' is used, hostname will actually become a subdirectory -defaults to UriPathname::HOST_SEP='__|'-)
|
45
|
+
* `:no_path => nopath` (String that will be used as a path placeholder when no URI's path exists, -default UriPathname::NO_PTH = '_NOPATH_'-)
|
46
|
+
|
47
|
+
== Note on Patches/Pull Requests
|
48
|
+
|
49
|
+
* Fork the project.
|
50
|
+
* Make your feature addition or bug fix.
|
51
|
+
* Add tests for it. This is important so I don't break it in a
|
52
|
+
future version unintentionally.
|
53
|
+
* Commit, do not mess with rakefile, version, or history.
|
54
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
55
|
+
* Send me a pull request. Bonus points for topic branches.
|
56
|
+
|
57
|
+
== Copyright
|
58
|
+
|
59
|
+
Copyright (c) 2011 Marcel Massana. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# Rakefile for the web_dump gem: packaging (Jeweler), tests, coverage (rcov)
# and API documentation (rdoc).

require 'rubygems'
require 'rake'

# Loaded outside the Jeweler guard so the gem version is available to every
# task (packaging and rdoc), even when Jeweler is not installed.
require File.expand_path('../lib/web_dump/version', __FILE__)

# Packaging and release tasks. They are optional: without Jeweler the
# remaining tasks must still work.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "web_dump"
    gem.summary = %Q{Saves and Retrieves data in files given an URI}
    gem.description = %Q{Saves and Retrieves data given an URI. The filename
      will be automatically choosed using that URI freeing the user to think
      about that}.gsub(/\s+/,' ')
    gem.email = "xaxaupua@gmail.com"
    gem.homepage = "http://github.com/syborg/web_dump"
    gem.authors = ["Marcel Massana"]
    gem.add_dependency "uri_pathname", ">= 0"
    # gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    gem.version = WebDump::Version::STRING
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage report; replaced by an explanatory task when rcov is missing.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined by Jeweler (and not by every version of
# it); depending on it unconditionally makes `rake test` fail with
# "Don't know how to build task" when it is absent.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. 'rake/rdoctask' was removed from Rake, so the task
# shipped with RDoc is tried first and the old one is kept only as a fallback
# for legacy environments.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  # The gem keeps its version in WebDump::Version, not in a VERSION file.
  rdoc.title = "web_dump #{WebDump::Version::STRING}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/lib/web_dump.rb
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
# WebDump
|
2
|
+
# MME 31/8/2011
|
3
|
+
#
|
4
|
+
# Allows saving and reading data related to URIs (i.e. pages)
|
5
|
+
|
6
|
+
require 'web_dump/version'
|
7
|
+
require 'uri'
|
8
|
+
require 'zlib'
|
9
|
+
require 'fileutils'
|
10
|
+
require 'rubygems'
|
11
|
+
require 'uri_pathname'
|
12
|
+
|
13
|
+
# Allows saving and reading data related to URIs (i.e. pages)
|
14
|
+
# Saves and retrieves data (typically web pages) in files named after URIs.
#
# The mapping between URIs and pathnames is delegated to an UriPathname
# object (available through #up). WebDump creates the needed directories and
# transparently gzips/gunzips content whose file extension ends in '.gz'.
class WebDump

  # Default values for the attributes handled directly by WebDump.
  DEFAULT_ATTRS = {
    :base_dir => '~/web_dumps',
    :file_ext => '.html'
  }

  # File extensions matching this pattern are stored gzip-compressed. The same
  # pattern is used for saving and for reading so both always agree.
  GZIP_EXT = /\.gz$/

  attr_accessor :up, *(DEFAULT_ATTRS.keys)

  # Initializes a WebDump object. +options+ should be a Hash with options for
  # the UriPathname object that is internally created (the whole Hash is
  # passed to it) and additionally:
  # :base_dir => directory where everything will be stored (def. '~/web_dumps')
  # :file_ext => extension that will be appended to filenames (def. '.html')
  #
  # Anything that is not a Hash is ignored and the defaults are used.
  def initialize(options = {})
    attributes = options.is_a?(Hash) ? DEFAULT_ATTRS.merge(options) : DEFAULT_ATTRS.dup

    # Values are stored as Strings (as they always have been), but without
    # evaluating generated source code, so quotes or backslashes inside a
    # value can neither break nor inject code.
    DEFAULT_ATTRS.each_key { |key| instance_variable_set("@#{key}", attributes[key].to_s) }

    @up = UriPathname.new attributes # any valid option passed will be delivered
  end

  # Saves +content+ (String) into a file named after
  # UriPathname#uri_to_pathname(+uri+).
  # If +extension+ is nil the :file_ext option given to #initialize is used:
  #   'anything'+'.gz' -> gzipped (less storage requirements)
  #   other -> as is
  # Returns a String containing the complete pathname of the file if OK,
  # or nil when no pathname can be obtained for +uri+.
  def save(uri, content, extension = nil)
    extension ||= @file_ext
    pathname = @up.uri_to_pathname(uri, nil, extension)
    return nil unless pathname

    mkdir_if_not_exists(File.dirname(pathname))
    num_bytes = nil
    case extension
    when GZIP_EXT
      # GzipWriter.open writes in binary mode and closes the file itself.
      # (Setting gz.comment here did not seem to have any effect.)
      Zlib::GzipWriter.open(pathname) { |gz| num_bytes = gz.write(content) }
    else # any other extension: stored as is
      File.open(pathname, 'w') { |f| num_bytes = f.write(content) }
    end
    num_bytes ? pathname : nil
  end

  # Returns the stored content corresponding to file +pathname+. In case there
  # isn't any file (or +pathname+ is nil) it returns nil.
  def read_pathname(pathname)
    return nil unless pathname

    complete_pathname = File.expand_path(pathname)
    return nil unless File.file?(complete_pathname)

    parsed = @up.parse(pathname)
    extension = parsed && parsed[2] # third element is the file extension
    case extension
    when GZIP_EXT
      Zlib::GzipReader.open(complete_pathname) { |gz| gz.read }
    else # others as is
      File.open(complete_pathname, 'r') { |f| f.read }
    end
  end

  # Returns the stored content corresponding to +uri+ URI. +filext+ defaults
  # to the :file_ext option given to #initialize. In case there isn't any
  # file it returns nil.
  def read_uri(uri, filext = nil)
    filext ||= @file_ext
    read_pathname(@up.uri_to_pathname(uri, nil, filext))
  end

  private

  # Creates +directory+ (and any missing parent) if it doesn't exist.
  def mkdir_if_not_exists(directory)
    dir = File.expand_path(directory)
    FileUtils.mkdir_p(dir) unless File.directory?(dir)
  end

end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'web_dump'
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
# Unit tests for WebDump: automatic directory/file creation and save/read
# round trips for raw and gzip-compressed files.
class TC_WebDump < Test::Unit::TestCase

  # Scratch directory used by the tests; it is removed before and after
  # every test.
  TEST_DIR = '~/tmp/web_dump'

  # called before every test
  def setup
    FileUtils.remove_dir(File.expand_path(TEST_DIR), true)
    @wd = WebDump.new :base_dir => TEST_DIR
  end

  # called after every test
  def teardown
    FileUtils.remove_dir(File.expand_path(TEST_DIR), true)
  end

  # Saving must create the needed directories. With :host_sep => '/' the
  # README states that the hostname becomes a subdirectory.
  def test_automatic_dir_and_file_creation
    wd = WebDump.new :base_dir => TEST_DIR, :host_sep => '/'
    pathname = wd.save 'http://www.fake.fak/fakpath', 'Hello World!'
    assert(File.exist?(pathname))
  end

  # Save/read cycle using the default (uncompressed) extension.
  def test_raw_file_sr_cycle
    input = 'Hello World!'
    uri = 'http://www.com/prova'
    pathname = @wd.save(uri, input)
    output = @wd.read_pathname(pathname)
    assert_equal input, output, "retrieved through pathname"
    output = @wd.read_uri(uri)
    assert_equal input, output, "retrieved through uri"
  end

  # Save/read cycle using gzip compression. The very same extension is used
  # for saving and for reading, so both refer to the same file.
  def test_gzipped_file_sr_cycle
    input = 'Hello World!'
    uri = 'http://www.com/prova'
    pathname = @wd.save(uri, input, ".gz")
    output = @wd.read_pathname(pathname)
    assert_equal input, output, "retrieved through pathname"
    output = @wd.read_uri(uri, ".gz")
    assert_equal input, output, "retrieved through uri"
  end

end
|
data/web_dump.gemspec
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for web_dump.
#
# NOTE(review): this file is generated by Jeweler. The editor temporary files
# ('.goutputstream-*') it used to list are not part of the gem; make sure they
# are also removed from the repository so they do not reappear the next time
# 'rake gemspec' is run.
Gem::Specification.new do |s|
  s.name = "web_dump"
  s.version = "0.0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Marcel Massana"]
  s.date = "2011-08-31"
  s.description = "Saves and Retrieves data given an URI. The filename will be automatically choosed using that URI freeing the user to think about that"
  s.email = "xaxaupua@gmail.com"
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "lib/web_dump.rb",
    "lib/web_dump/version.rb",
    "test/test_web_dump.rb",
    "web_dump.gemspec"
  ]
  s.homepage = "http://github.com/syborg/web_dump"
  # The LICENSE file contains the MIT license text.
  s.licenses = ["MIT"] if s.respond_to? :licenses=
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.10"
  s.summary = "Saves and Retrieves data in files given an URI"

  # Old RubyGems versions do not distinguish runtime from development
  # dependencies, hence the fallbacks to add_dependency.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<uri_pathname>, [">= 0"])
    else
      s.add_dependency(%q<uri_pathname>, [">= 0"])
    end
  else
    s.add_dependency(%q<uri_pathname>, [">= 0"])
  end
end
|
50
|
+
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web_dump
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 75
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
- 0
|
11
|
+
version: 0.0.1.0
|
12
|
+
platform: ruby
|
13
|
+
authors:
|
14
|
+
- Marcel Massana
|
15
|
+
autorequire:
|
16
|
+
bindir: bin
|
17
|
+
cert_chain: []
|
18
|
+
|
19
|
+
date: 2011-08-31 00:00:00 Z
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: uri_pathname
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Saves and Retrieves data given an URI. The filename will be automatically choosed using that URI freeing the user to think about that
|
36
|
+
email: xaxaupua@gmail.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files:
|
42
|
+
- LICENSE
|
43
|
+
- README.rdoc
|
44
|
+
files:
|
45
|
+
- .document
|
46
|
+
- LICENSE
|
47
|
+
- README.rdoc
|
48
|
+
- Rakefile
|
49
|
+
- lib/web_dump.rb
|
50
|
+
- lib/web_dump/version.rb
|
51
|
+
- test/test_web_dump.rb
|
52
|
+
- web_dump.gemspec
|
53
|
+
homepage: http://github.com/syborg/web_dump
|
54
|
+
licenses: []
|
55
|
+
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
hash: 3
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
requirements: []
|
80
|
+
|
81
|
+
rubyforge_project:
|
82
|
+
rubygems_version: 1.8.10
|
83
|
+
signing_key:
|
84
|
+
specification_version: 3
|
85
|
+
summary: Saves and Retrieves data in files given an URI
|
86
|
+
test_files: []
|
87
|
+
|