extraloop 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
@@ -4,7 +4,7 @@ results = []
|
|
4
4
|
|
5
5
|
ExtraLoop::IterativeScraper.new("https://www.google.com/search?tbm=nws&q=Egypt", :log => {
|
6
6
|
:log_level => :debug,
|
7
|
-
:appenders => [Logging.appenders.stderr ]
|
7
|
+
:appenders => [ Logging.appenders.stderr ]
|
8
8
|
|
9
9
|
}).set_iteration(:start, (1..101).step(10)).
|
10
10
|
loop_on("h3") { |nodes| nodes.map(&:parent) }.
|
@@ -23,7 +23,9 @@ options = {
|
|
23
23
|
:log_level => :info
|
24
24
|
}
|
25
25
|
}
|
26
|
-
request_arguments = { :params => params }
|
26
|
+
request_arguments = { :params => params, :headers => {
|
27
|
+
"User-Agent" => "ExtraLoop - ruby data extraction toolkit: http://github.com/afiore/extraloop"
|
28
|
+
}}
|
27
29
|
|
28
30
|
|
29
31
|
#
|
@@ -39,9 +41,9 @@ ExtraLoop::IterativeScraper.new(api_url, options, request_arguments).
|
|
39
41
|
extract(:ns).
|
40
42
|
extract(:type).
|
41
43
|
extract(:timestamp).
|
42
|
-
on(:data
|
44
|
+
on(:data) do |results|
|
43
45
|
results.each { |record| all_results << record }
|
44
|
-
|
46
|
+
end.
|
45
47
|
continue_with(:cmcontinue, ['query-continue', 'categorymembers', 'cmcontinue']).
|
46
48
|
run()
|
47
49
|
|
@@ -29,7 +29,9 @@ class WikipediaCategoryScraper < ExtraLoop::IterativeScraper
|
|
29
29
|
:format => :json,
|
30
30
|
:log => false
|
31
31
|
}
|
32
|
-
request_arguments = { :params => params }
|
32
|
+
request_arguments = { :params => params, :headers => {
|
33
|
+
"User-Agent" => "ExtraLoop - ruby data extraction toolkit: http://github.com/afiore/extraloop"
|
34
|
+
}}
|
33
35
|
|
34
36
|
super(@@api_url, options, request_arguments)
|
35
37
|
|
@@ -38,17 +40,16 @@ class WikipediaCategoryScraper < ExtraLoop::IterativeScraper
|
|
38
40
|
extract(:ns).
|
39
41
|
extract(:type).
|
40
42
|
extract(:timestamp).
|
41
|
-
on(:data
|
43
|
+
on(:data) do |results|
|
42
44
|
puts "#{"\t" * (@options[:depth] - 2).abs } #{@scraper.request_arguments[:params][:cmtitle]}"
|
43
45
|
categories = results.select{ |record| record.ns === 14 }.each { |category| results.delete(category) }
|
44
46
|
|
45
|
-
|
46
47
|
categories.each do |record|
|
47
48
|
# Instantiate a sub scraper if the current depth is greater than zero and the category member is a sub category.
|
48
49
|
WikipediaCategoryScraper.new(record.title, @options[:depth] - 1, @scraper.request_arguments[:params][:cmtitle] ).run unless @options[:depth] <= 0
|
49
50
|
end
|
50
51
|
|
51
|
-
|
52
|
+
end.
|
52
53
|
continue_with(:cmcontinue, ['query-continue', 'categorymembers', 'cmcontinue'])
|
53
54
|
end
|
54
55
|
end
|
@@ -89,8 +89,8 @@ module ExtraLoop
|
|
89
89
|
# Returns itself.
|
90
90
|
#
|
91
91
|
|
92
|
-
def set_iteration(param, *args)
|
93
|
-
|
92
|
+
def set_iteration(param, *args, &block)
|
93
|
+
args << block if block
|
94
94
|
if args.first.respond_to?(:map)
|
95
95
|
@iteration_set = Array(args.first).map &:to_s
|
96
96
|
else
|
@@ -110,7 +110,8 @@ module ExtraLoop
|
|
110
110
|
#
|
111
111
|
# Returns itself.
|
112
112
|
|
113
|
-
def continue_with(param, *extractor_args)
|
113
|
+
def continue_with(param, *extractor_args, &block)
|
114
|
+
extractor_args << block if block
|
114
115
|
raise Exceptions::NonGetAsyncRequestNotYetImplemented.new "the #continue_with method currently requires the 'async' option to be set to false" if @options[:async]
|
115
116
|
|
116
117
|
@continue_clause_args = extractor_args
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-01-
|
12
|
+
date: 2012-01-30 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &15579720 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *15579720
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: nokogiri
|
27
|
-
requirement: &
|
27
|
+
requirement: &15579260 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.5.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *15579260
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: typhoeus
|
38
|
-
requirement: &
|
38
|
+
requirement: &15578800 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.3.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *15578800
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: logging
|
49
|
-
requirement: &
|
49
|
+
requirement: &15578340 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.6.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *15578340
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &15577880 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.7.0
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *15577880
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rr
|
71
|
-
requirement: &
|
71
|
+
requirement: &15577420 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: 1.0.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *15577420
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: pry
|
82
|
-
requirement: &
|
82
|
+
requirement: &15576960 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ~>
|
@@ -87,7 +87,7 @@ dependencies:
|
|
87
87
|
version: 0.9.7.4
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *15576960
|
91
91
|
description: A Ruby library for extracting data from websites and web based APIs.
|
92
92
|
Supports most common document formats (i.e. HTML, XML, and JSON), and comes with
|
93
93
|
a handy mechanism for iterating over paginated datasets.
|