extraloop 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ results = []
|
|
4
4
|
|
5
5
|
ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
|
6
6
|
:log_level => :debug,
|
7
|
-
:appenders => [Logging.appenders.stderr ]
|
7
|
+
:appenders => [ Logging.appenders.stderr ]
|
8
8
|
|
9
9
|
}).set_iteration(:start, (1..101).step(10)).
|
10
10
|
loop_on("h3") { |nodes| nodes.map(&:parent) }.
|
@@ -23,7 +23,9 @@ options = {
|
|
23
23
|
:log_level => :info
|
24
24
|
}
|
25
25
|
}
|
26
|
-
request_arguments = { :params => params
|
26
|
+
request_arguments = { :params => params, :headers => {
|
27
|
+
"User-Agent" => "ExtraLoop - ruby data extraction toolkit: http://github.com/afiore/extraloop"
|
28
|
+
}}
|
27
29
|
|
28
30
|
|
29
31
|
#
|
@@ -39,9 +41,9 @@ ExtraLoop::IterativeScraper.new(api_url, options, request_arguments).
|
|
39
41
|
extract(:ns).
|
40
42
|
extract(:type).
|
41
43
|
extract(:timestamp).
|
42
|
-
on(:data
|
44
|
+
on(:data) do |results|
|
43
45
|
results.each { |record| all_results << record }
|
44
|
-
|
46
|
+
end.
|
45
47
|
continue_with(:cmcontinue, ['query-continue', 'categorymembers', 'cmcontinue']).
|
46
48
|
run()
|
47
49
|
|
@@ -29,7 +29,9 @@ class WikipediaCategoryScraper < ExtraLoop::IterativeScraper
|
|
29
29
|
:format => :json,
|
30
30
|
:log => false
|
31
31
|
}
|
32
|
-
request_arguments = { :params => params
|
32
|
+
request_arguments = { :params => params, :headers => {
|
33
|
+
"User-Agent" => "ExtraLoop - ruby data extraction toolkit: http://github.com/afiore/extraloop"
|
34
|
+
}}
|
33
35
|
|
34
36
|
super(@@api_url, options, request_arguments)
|
35
37
|
|
@@ -38,17 +40,16 @@ class WikipediaCategoryScraper < ExtraLoop::IterativeScraper
|
|
38
40
|
extract(:ns).
|
39
41
|
extract(:type).
|
40
42
|
extract(:timestamp).
|
41
|
-
on(:data
|
43
|
+
on(:data) do |results|
|
42
44
|
puts "#{"\t" * (@options[:depth] - 2).abs } #{@scraper.request_arguments[:params][:cmtitle]}"
|
43
45
|
categories = results.select{ |record| record.ns === 14 }.each { |category| results.delete(category) }
|
44
46
|
|
45
|
-
|
46
47
|
categories.each do |record|
|
47
48
|
# Instanciate a sub scraper if the current depth is greater than zero and the category member is a sub category.
|
48
49
|
WikipediaCategoryScraper.new(record.title, @options[:depth] - 1, @scraper.request_arguments[:params][:cmtitle] ).run unless @options[:depth] <= 0
|
49
50
|
end
|
50
51
|
|
51
|
-
|
52
|
+
end.
|
52
53
|
continue_with(:cmcontinue, ['query-continue', 'categorymembers', 'cmcontinue'])
|
53
54
|
end
|
54
55
|
end
|
@@ -89,8 +89,8 @@ module ExtraLoop
|
|
89
89
|
# Returns itself.
|
90
90
|
#
|
91
91
|
|
92
|
-
def set_iteration(param, *args)
|
93
|
-
|
92
|
+
def set_iteration(param, *args, &block)
|
93
|
+
args << block if block
|
94
94
|
if args.first.respond_to?(:map)
|
95
95
|
@iteration_set = Array(args.first).map &:to_s
|
96
96
|
else
|
@@ -110,7 +110,8 @@ module ExtraLoop
|
|
110
110
|
#
|
111
111
|
# Returns itself.
|
112
112
|
|
113
|
-
def continue_with(param, *extractor_args)
|
113
|
+
def continue_with(param, *extractor_args, &block)
|
114
|
+
extractor_args << block if block
|
114
115
|
raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
|
115
116
|
|
116
117
|
@continue_clause_args = extractor_args
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-30 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &15579720 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *15579720
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: nokogiri
|
27
|
-
requirement: &
|
27
|
+
requirement: &15579260 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.5.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *15579260
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: typhoeus
|
38
|
-
requirement: &
|
38
|
+
requirement: &15578800 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.3.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *15578800
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: logging
|
49
|
-
requirement: &
|
49
|
+
requirement: &15578340 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.6.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *15578340
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &15577880 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.7.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *15577880
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rr
|
71
|
-
requirement: &
|
71
|
+
requirement: &15577420 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.0.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *15577420
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: pry
|
82
|
-
requirement: &
|
82
|
+
requirement: &15576960 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,7 +87,7 @@ dependencies:
|
|
87
87
|
version: 0.9.7.4
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *15576960
|
91
91
|
description: A Ruby library for extracting data from websites and web based APIs.
|
92
92
|
Supports most common document formats (i.e. HTML, XML, and JSON), and comes with
|
93
93
|
a handy mechanism for iterating over paginated datasets.
|