extraloop-redis-storage 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/README.rdoc +27 -3
- data/bin/extraloop +3 -3
- data/lib/extraloop/redis-storage.rb +2 -0
- data/lib/extraloop/redis-storage/scraping_session.rb +8 -0
- metadata +15 -15
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -3,8 +3,9 @@
|
|
3
3
|
== Description
|
4
4
|
|
5
5
|
Persistence layer for the {ExtraLoop}[https://github.com/afiore/extraloop] data extraction toolkit.
|
6
|
-
|
7
|
-
makes storing
|
6
|
+
This module is implemented as a wrapper around {Ohm}[http://ohm.keyvalue.org], an object-hash mapping library which
|
7
|
+
makes it easy to store structured data in Redis. It comes with a convenient command line tool, which allows you to
|
8
|
+
list, filter, delete, and export harvested datasets.
|
8
9
|
|
9
10
|
== Installation
|
10
11
|
|
@@ -44,11 +45,34 @@ session instance:
|
|
44
45
|
reviews = scraper.session.records AmazonReview
|
45
46
|
|
46
47
|
|
47
|
-
|
48
|
+
=== The #set_storage method
|
48
49
|
|
49
50
|
The +set_storage+ method can be called with the following arguments:
|
50
51
|
|
51
52
|
* _model_ A Ruby constant specifying the model to be used for storing the extracted data.
|
52
53
|
* _session_title_ A human readable title for the extracted dataset (optional).
|
53
54
|
|
55
|
+
== Command line interface
|
54
56
|
|
57
|
+
Once installed, the gem will also add to your system path the +extraloop+ executable, a command line interface to the datasets harvested through extraloop.
|
58
|
+
A list of datasets can be obtained by running:
|
59
|
+
|
60
|
+
extraloop datastore list:
|
61
|
+
|
62
|
+
This will generate a table like the following one:
|
63
|
+
<code>
|
64
|
+
id | title | model | records
|
65
|
+
--------------------------------------------------------------------
|
66
|
+
48 | 1330106699 GoogleNewsStory Dataset | GoogleNewsStory | 110
|
67
|
+
49 | 1330106948 AmazonReview Dataset | AmazonReview | 0
|
68
|
+
51 | 1330107087 GoogleNewsStory Dataset | GoogleNewsStory | 110
|
69
|
+
52 | 1330111630 AmazonReview Dataset | AmazonReview | 10
|
70
|
+
</code>
|
71
|
+
|
72
|
+
Datasets can be removed using the +delete+ subcommand:
|
73
|
+
|
74
|
+
extraloop datastore delete [id]
|
75
|
+
|
76
|
+
Where +id+ is either a single scraping session id, or a session id range (e.g. 48..52).
|
77
|
+
Finally, the +export+ subcommand allows you to export one or several datasets into JSON or CSV documents.
|
78
|
+
Please refer to the executable inline help (<code>extraloop datastore help [command]</code>) for more usage information.
|
data/bin/extraloop
CHANGED
@@ -34,7 +34,7 @@ class DataStoreCommand < Thor
|
|
34
34
|
|
35
35
|
desc "export [sessions]", "Export datasets by session id or session id range"
|
36
36
|
|
37
|
-
method_option :format, :default => 'json', :type => 'string'
|
37
|
+
method_option :format, :default => 'json', :type => 'string', :aliases => "-f"
|
38
38
|
method_option :directory, :default => "#{Time.now.to_i}-export", :aliases => "-o"
|
39
39
|
method_option :require, :type => 'array', :aliases => "-r"
|
40
40
|
|
@@ -43,8 +43,8 @@ class DataStoreCommand < Thor
|
|
43
43
|
require_files paths if paths.any?
|
44
44
|
format = options[:format]
|
45
45
|
dir = options[:directory]
|
46
|
-
|
47
|
-
raise
|
46
|
+
exception = DataStoreCommand::Exceptions::FormatNotImplemented.new "Format not supported #{format}"
|
47
|
+
raise exception unless %w[json csv].include? format
|
48
48
|
FileUtils.mkdir(dir) unless File.exists? dir
|
49
49
|
|
50
50
|
(filter sessions).each do |session|
|
@@ -25,6 +25,8 @@ module ExtraLoop
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
+
autoload :CSV, 'csv'
|
29
|
+
autoload :Iconv, 'iconv'
|
28
30
|
ExtraLoop::Storage.autoload :Record, "#{base_path}/redis-storage/record.rb"
|
29
31
|
ExtraLoop::Storage.autoload :ScrapingSession, "#{base_path}/redis-storage/scraping_session.rb"
|
30
32
|
ExtraLoop::Storage.autoload :Model, "#{base_path}/redis-storage/model.rb"
|
@@ -1,11 +1,15 @@
|
|
1
1
|
class ExtraLoop::Storage::ScrapingSession < Ohm::Model
|
2
2
|
|
3
|
+
BOM = "\377\376" #Byte Order Mark
|
4
|
+
|
3
5
|
include Ohm::Boundaries
|
4
6
|
include Ohm::Timestamping
|
5
7
|
include Ohm::Callbacks
|
6
8
|
|
7
9
|
attribute :title
|
8
10
|
reference :model, ExtraLoop::Storage::Model
|
11
|
+
|
12
|
+
|
9
13
|
|
10
14
|
def records(params={})
|
11
15
|
klass = if Object.const_defined?(model.name)
|
@@ -47,5 +51,9 @@ class ExtraLoop::Storage::ScrapingSession < Ohm::Model
|
|
47
51
|
end
|
48
52
|
|
49
53
|
def to_csv
|
54
|
+
_records = Array records.all.map &:to_hash
|
55
|
+
header = _records.first && _records.first.keys.map(&:to_s)
|
56
|
+
data = [header].concat _records.map(&:values)
|
57
|
+
output = data.map { |cells| CSV.generate_line cells }.join
|
50
58
|
end
|
51
59
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop-redis-storage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-02-26 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: extraloop
|
16
|
-
requirement: &
|
16
|
+
requirement: &10773840 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.0.3
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *10773840
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: ohm
|
27
|
-
requirement: &
|
27
|
+
requirement: &10773380 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.1.3
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *10773380
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: ohm-contrib
|
38
|
-
requirement: &
|
38
|
+
requirement: &10772860 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.1.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *10772860
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: thor
|
49
|
-
requirement: &
|
49
|
+
requirement: &10772260 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - =
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.14.6
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *10772260
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &10771600 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.7.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *10771600
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rr
|
71
|
-
requirement: &
|
71
|
+
requirement: &10771000 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.0.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *10771000
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: pry
|
82
|
-
requirement: &
|
82
|
+
requirement: &10770520 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,7 +87,7 @@ dependencies:
|
|
87
87
|
version: 0.9.7.4
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *10770520
|
91
91
|
description: Redis+Ohm based storage for data sets extracted using the ExtraLoop toolkit.
|
92
92
|
email: andrea.giulio.fiore@googlemail.com
|
93
93
|
executables:
|