web_dump 0.0.1.0 → 0.0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +65 -8
- data/examples/simple_examples.rb +40 -0
- data/lib/web_dump/version.rb +1 -1
- data/web_dump.gemspec +3 -6
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -20,30 +20,87 @@ The main source repository is http://github.com/syborg/web_dump.
|
|
20
20
|
|
21
21
|
== Usage
|
22
22
|
|
23
|
+
=== Instantiating
|
24
|
+
|
23
25
|
First of all ...
|
24
26
|
|
25
27
|
require 'rubygems'
|
26
28
|
require 'web_dump'
|
27
29
|
|
28
|
-
|
29
|
-
array
|
30
|
+
When instantiating an object, you may pass some options through a Hash
|
30
31
|
|
31
|
-
wd = WebDump
|
32
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
32
33
|
|
33
34
|
`wd`, when asked to, will save all files inside expanded directory '~/mydir'
|
34
|
-
with an appended file extension at the end
|
35
|
+
with an appended file extension '.gz' at the end (if not overwritten later)
|
35
36
|
|
36
|
-
|
37
|
+
Some options that can be passed when instantiating an object are listed below. Most of them are passed directly along to a UriPathname object that is created.
|
37
38
|
|
38
39
|
* `:file_ext => extension` (String that will be appended at the end to every filename if not changed from _save_ method)
|
39
|
-
|
40
|
-
Most of them are also passed along to an UriPathname object that is created.
|
41
|
-
|
42
40
|
* `:base_dir => dir_name` (directory where everything will be stored. Defaults to '~/web_dumps')
|
43
41
|
* `:pth_sep => psep` (String that will be used to substitute '/' inside URI's path and queries (defaults to UriPathname::PTH_SEP='_|_'))
|
44
42
|
* `:host_sep => hsep` (String that will be used to separate the URI's hostname and path when constructing the pathname. If '/' is used, the hostname will actually become a subdirectory -defaults to UriPathname::HOST_SEP='__|'-)
|
45
43
|
* `:no_path => nopath` (String that will be used as a path placeholder when no URI's path exists, -default UriPathname::NO_PTH = '_NOPATH_'-)
|
46
44
|
|
45
|
+
=== Saving Web Contents
|
46
|
+
|
47
|
+
You should use WebDump#save, for example:
|
48
|
+
|
49
|
+
wd.save "http://hello.world.com/hithere", data
|
50
|
+
|
51
|
+
=== Retrieving Web Contents
|
52
|
+
|
53
|
+
You can retrieve data using two flavours of read method, taking either URIs or
|
54
|
+
pathnames as main argument
|
55
|
+
|
56
|
+
data = wd.read_uri(uri)
|
57
|
+
|
58
|
+
or
|
59
|
+
|
60
|
+
data = wd.read_pathname(f)
|
61
|
+
|
62
|
+
== Example
|
63
|
+
|
64
|
+
Here is a complete example
|
65
|
+
|
66
|
+
require 'rubygems'
|
67
|
+
require 'open-uri'
|
68
|
+
require 'web_dump'
|
69
|
+
|
70
|
+
MY_URIS = [
|
71
|
+
'http://en.wikipedia.org/wiki/Ruby_Bridges',
|
72
|
+
'http://donaldfagen.com/disc_nightfly.php',
|
73
|
+
'http://www.rubi.cat/ajrubi/portada/index.php',
|
74
|
+
'http://www.google.com/cse?q=array&cx=013598269713424429640%3Ag5orptiw95w&ie=UTF-8&sa=Search'
|
75
|
+
]
|
76
|
+
|
77
|
+
# all files will be saved in expanded '~/mydir' with file extension '.gz'
|
78
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
79
|
+
|
80
|
+
# Don't care about filenames while saving pages into files
|
81
|
+
puts "Saving data using URIs"
|
82
|
+
MY_URIS.each do |uri|
|
83
|
+
open uri do |u|
|
84
|
+
data = u.read
|
85
|
+
puts wd.save uri, data
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
# Possibly mocking? ... don't care about filenames while retrieving pages from files.
|
90
|
+
puts "\nRetrieving data using URIs"
|
91
|
+
MY_URIS.each do |uri|
|
92
|
+
data = wd.read_uri(uri)
|
93
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
94
|
+
end
|
95
|
+
|
96
|
+
# ... or, conversely, use filenames if you need so
|
97
|
+
puts "\nRetrieving data using pathnames"
|
98
|
+
files = Dir[File.expand_path('*.gz', '~/mydir')]
|
99
|
+
files.each do |f|
|
100
|
+
data = wd.read_pathname(f)
|
101
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
102
|
+
end
|
103
|
+
|
47
104
|
== Note on Patches/Pull Requests
|
48
105
|
|
49
106
|
* Fork the project.
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Simple examples showing how to use WebDump
|
2
|
+
# Marcel Massana 1-Sep-2011
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'open-uri'
|
6
|
+
require 'web_dump'
|
7
|
+
|
8
|
+
MY_URIS = [
|
9
|
+
'http://en.wikipedia.org/wiki/Ruby_Bridges',
|
10
|
+
'http://donaldfagen.com/disc_nightfly.php',
|
11
|
+
'http://www.rubi.cat/ajrubi/portada/index.php',
|
12
|
+
'http://www.google.com/cse?q=array&cx=013598269713424429640%3Ag5orptiw95w&ie=UTF-8&sa=Search'
|
13
|
+
]
|
14
|
+
|
15
|
+
# all files will be saved in expanded '~/mydir' with file extension '.gz'
|
16
|
+
wd = WebDump.new :base_dir => '~/mydir', :file_ext => '.gz'
|
17
|
+
|
18
|
+
# Don't care about filenames while saving pages into files
|
19
|
+
puts "Saving data using URIs"
|
20
|
+
MY_URIS.each do |uri|
|
21
|
+
open uri do |u|
|
22
|
+
data = u.read
|
23
|
+
puts wd.save uri, data
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
# Possibly mocking?... also don't care about filenames while retrieving pages from files.
|
28
|
+
puts "\nRetrieving data using URIs"
|
29
|
+
MY_URIS.each do |uri|
|
30
|
+
data = wd.read_uri(uri)
|
31
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
32
|
+
end
|
33
|
+
|
34
|
+
# ... or, conversely, use filenames if you need so
|
35
|
+
puts "\nRetrieving data using pathnames"
|
36
|
+
files = Dir[File.expand_path('*.gz', '~/mydir')]
|
37
|
+
files.each do |f|
|
38
|
+
data = wd.read_pathname(f)
|
39
|
+
puts data[0...100].gsub(/\s+/, ' ').strip
|
40
|
+
end
|
data/lib/web_dump/version.rb
CHANGED
data/web_dump.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "web_dump"
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Marcel Massana"]
|
12
|
-
s.date = "2011-
|
12
|
+
s.date = "2011-09-01"
|
13
13
|
s.description = "Saves and Retrieves data given an URI. The filename will be automatically choosed using that URI freeing the user to think about that"
|
14
14
|
s.email = "xaxaupua@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -18,13 +18,10 @@ Gem::Specification.new do |s|
|
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
|
-
".goutputstream-6QBL0V",
|
22
|
-
".goutputstream-6X1P0V",
|
23
|
-
".goutputstream-IR2O0V",
|
24
|
-
".goutputstream-TK420V",
|
25
21
|
"LICENSE",
|
26
22
|
"README.rdoc",
|
27
23
|
"Rakefile",
|
24
|
+
"examples/simple_examples.rb",
|
28
25
|
"lib/web_dump.rb",
|
29
26
|
"lib/web_dump/version.rb",
|
30
27
|
"test/test_web_dump.rb",
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: web_dump
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 71
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
9
|
+
- 2
|
10
10
|
- 0
|
11
|
-
version: 0.0.
|
11
|
+
version: 0.0.2.0
|
12
12
|
platform: ruby
|
13
13
|
authors:
|
14
14
|
- Marcel Massana
|
@@ -16,7 +16,7 @@ autorequire:
|
|
16
16
|
bindir: bin
|
17
17
|
cert_chain: []
|
18
18
|
|
19
|
-
date: 2011-
|
19
|
+
date: 2011-09-01 00:00:00 Z
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
22
|
name: uri_pathname
|
@@ -46,6 +46,7 @@ files:
|
|
46
46
|
- LICENSE
|
47
47
|
- README.rdoc
|
48
48
|
- Rakefile
|
49
|
+
- examples/simple_examples.rb
|
49
50
|
- lib/web_dump.rb
|
50
51
|
- lib/web_dump/version.rb
|
51
52
|
- test/test_web_dump.rb
|