web_dump 0.0.1.0 → 0.0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +65 -8
- data/examples/simple_examples.rb +40 -0
- data/lib/web_dump/version.rb +1 -1
- data/web_dump.gemspec +3 -6
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -20,30 +20,87 @@ The main source repository is http://github.com/syborg/web_dump.
|
|
20
20
|
|
21
21
|
== Usage
|
22
22
|
|
23
|
+
=== Instantiating
|
24
|
+
|
23
25
|
First of all ...
|
24
26
|
|
25
27
|
require 'rubygems'
|
26
28
|
require 'web_dump'
|
27
29
|
|
28
|
-
|
29
|
-
array
|
30
|
+
When instantiating an object, you may pass some options through a Hash
|
30
31
|
|
31
|
-
wd = WebDump
|
32
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
32
33
|
|
33
34
|
`wd`, when asked to, will save all files inside expanded directory '~/mydir'
|
34
|
-
with an appended file extension at the end
|
35
|
+
with an appended file extension '.gz' at the end (if not overwritten later)
|
35
36
|
|
36
|
-
|
37
|
+
The following options can be passed when instantiating an object. Most of them are passed directly along to the UriPathname object that is created.
|
37
38
|
|
38
39
|
* `:file_ext => extension` (String that will be appended at the end to every filename if not changed from _save_ method)
|
39
|
-
|
40
|
-
Most of them are also passed along to an UriPathname object that is created.
|
41
|
-
|
42
40
|
* `:base_dir => dir_name` (directory where everything will be stored. Defaults to '~/web_dumps')
|
43
41
|
* `:pth_sep => psep` (String that will be used to substitute '/' inside URI's path and queries (defaults to UriPathname::PTH_SEP='_|_'))
|
44
42
|
* `:host_sep => hsep` (String that will be used to separate the URI's hostname and path when constructing the pathname. If '/' is used, the hostname will actually become a subdirectory -defaults to UriPathname::HOST_SEP='__|'-)
|
45
43
|
* `:no_path => nopath` (String that will be used as a path placeholder when no URI's path exists, -default UriPathname::NO_PTH = '_NOPATH_'-)
|
46
44
|
|
45
|
+
=== Saving Web Contents
|
46
|
+
|
47
|
+
You should use WebDump#save, for example:
|
48
|
+
|
49
|
+
wd.save "http://hello.world.com/hithere", data
|
50
|
+
|
51
|
+
=== Retrieving Web Contents
|
52
|
+
|
53
|
+
You can retrieve data using two flavoured read methods, using URIs or using
|
54
|
+
pathnames as main argument
|
55
|
+
|
56
|
+
data = wd.read_uri(uri)
|
57
|
+
|
58
|
+
or
|
59
|
+
|
60
|
+
data = wd.read_pathname(f)
|
61
|
+
|
62
|
+
== Example
|
63
|
+
|
64
|
+
Here is a complete example
|
65
|
+
|
66
|
+
require 'rubygems'
|
67
|
+
require 'open-uri'
|
68
|
+
require 'web_dump'
|
69
|
+
|
70
|
+
MY_URIS = [
|
71
|
+
'http://en.wikipedia.org/wiki/Ruby_Bridges',
|
72
|
+
'http://donaldfagen.com/disc_nightfly.php',
|
73
|
+
'http://www.rubi.cat/ajrubi/portada/index.php',
|
74
|
+
'http://www.google.com/cse?q=array&cx=013598269713424429640%3Ag5orptiw95w&ie=UTF-8&sa=Search'
|
75
|
+
]
|
76
|
+
|
77
|
+
# all files will be saved in expanded '~/mydir' with file extension '.gz'
|
78
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
79
|
+
|
80
|
+
# Don't care about filenames while saving pages into files
|
81
|
+
puts "Saving data using URIs"
|
82
|
+
MY_URIS.each do |uri|
|
83
|
+
open uri do |u|
|
84
|
+
data = u.read
|
85
|
+
puts wd.save uri, data
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Possibly mocking? ... don't care about filenames while retrieving pages from files.
|
90
|
+
puts "\nRetrieving data using URIs"
|
91
|
+
MY_URIS.each do |uri|
|
92
|
+
data = wd.read_uri(uri)
|
93
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
94
|
+
end
|
95
|
+
|
96
|
+
# ... or, conversely, use filenames if you need so
|
97
|
+
puts "\nRetrieving data using pathnames"
|
98
|
+
files = Dir[File.expand_path('*.gz', '~/mydir')]
|
99
|
+
files.each do |f|
|
100
|
+
data = wd.read_pathname(f)
|
101
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
102
|
+
end
|
103
|
+
|
47
104
|
== Note on Patches/Pull Requests
|
48
105
|
|
49
106
|
* Fork the project.
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Simple examples showing how to use WebDump
|
2
|
+
# Marcel Massana 1-Sep-2011
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'web_dump'
|
7
|
+
|
8
|
+
MY_URIS = [
|
9
|
+
'http://en.wikipedia.org/wiki/Ruby_Bridges',
|
10
|
+
'http://donaldfagen.com/disc_nightfly.php',
|
11
|
+
'http://www.rubi.cat/ajrubi/portada/index.php',
|
12
|
+
'http://www.google.com/cse?q=array&cx=013598269713424429640%3Ag5orptiw95w&ie=UTF-8&sa=Search'
|
13
|
+
]
|
14
|
+
|
15
|
+
# all files will be saved in expanded '~/mydir' with file extension '.gz'
|
16
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
17
|
+
|
18
|
+
# Don't care about filenames while saving pages into files
|
19
|
+
puts "Saving data using URIs"
|
20
|
+
MY_URIS.each do |uri|
|
21
|
+
open uri do |u|
|
22
|
+
data = u.read
|
23
|
+
puts wd.save uri, data
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Possibly mocking?... also don't care about filenames while retrieving pages from files.
|
28
|
+
puts "\nRetrieving data using URIs"
|
29
|
+
MY_URIS.each do |uri|
|
30
|
+
data = wd.read_uri(uri)
|
31
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
32
|
+
end
|
33
|
+
|
34
|
+
# ... or, conversely, use filenames if you need so
|
35
|
+
puts "\nRetrieving data using pathnames"
|
36
|
+
files = Dir[File.expand_path('*.gz', '~/mydir')]
|
37
|
+
files.each do |f|
|
38
|
+
data = wd.read_pathname(f)
|
39
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
40
|
+
end
|
data/lib/web_dump/version.rb
CHANGED
data/web_dump.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "web_dump"
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Marcel Massana"]
|
12
|
-
s.date = "2011-
|
12
|
+
s.date = "2011-09-01"
|
13
13
|
s.description = "Saves and Retrieves data given an URI. The filename will be automatically choosed using that URI freeing the user to think about that"
|
14
14
|
s.email = "xaxaupua@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -18,13 +18,10 @@ Gem::Specification.new do |s|
|
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
|
-
".goutputstream-6QBL0V",
|
22
|
-
".goutputstream-6X1P0V",
|
23
|
-
".goutputstream-IR2O0V",
|
24
|
-
".goutputstream-TK420V",
|
25
21
|
"LICENSE",
|
26
22
|
"README.rdoc",
|
27
23
|
"Rakefile",
|
24
|
+
"examples/simple_examples.rb",
|
28
25
|
"lib/web_dump.rb",
|
29
26
|
"lib/web_dump/version.rb",
|
30
27
|
"test/test_web_dump.rb",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_dump
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 71
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
9
|
+
- 2
|
10
10
|
- 0
|
11
|
-
version: 0.0.
|
11
|
+
version: 0.0.2.0
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Marcel Massana
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-09-01 00:00:00 Z
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
name: uri_pathname
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- LICENSE
|
47
47
|
- README.rdoc
|
48
48
|
- Rakefile
|
49
|
+
- examples/simple_examples.rb
|
49
50
|
- lib/web_dump.rb
|
50
51
|
- lib/web_dump/version.rb
|
51
52
|
- test/test_web_dump.rb
|