extraloop 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -1
- data/README.md +2 -2
- data/examples/google_news_scraper.rb +1 -1
- data/examples/wikipedia_categories.rb +1 -1
- data/lib/extraloop.rb +18 -16
- data/lib/extraloop/dom_extractor.rb +38 -36
- data/lib/extraloop/extraction_environment.rb +16 -14
- data/lib/extraloop/extraction_loop.rb +37 -37
- data/lib/extraloop/extractor_base.rb +34 -33
- data/lib/extraloop/hookable.rb +18 -18
- data/lib/extraloop/iterative_scraper.rb +249 -250
- data/lib/extraloop/json_extractor.rb +27 -26
- data/lib/extraloop/loggable.rb +50 -49
- data/lib/extraloop/scraper_base.rb +144 -141
- data/lib/extraloop/utils.rb +64 -61
- data/spec/helpers/spec_helper.rb +2 -1
- metadata +24 -13
data/lib/extraloop/utils.rb
CHANGED
@@ -1,75 +1,78 @@
|
|
1
|
-
module
|
2
|
-
module
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
1
|
+
module ExtraLoop
|
2
|
+
module Utils
|
3
|
+
module ScrapingHelpers
|
4
|
+
#
|
5
|
+
# Generates a proc that iterates over a list of anchors
|
6
|
+
# and collects the value of the specified parameter
|
7
|
+
#
|
8
|
+
# Builds a lambda that extracts the distinct values of one query-string
# parameter from the +href+ attribute of each node in a list of anchors.
#
# param - The parameter name (String or Symbol; compared through #to_s).
#
# Anchors whose href has no query string, or whose query string does not
# give +param+ a value, are skipped instead of raising NoMethodError.
# Each "key=value" token is split at its first "=" only, so values that
# themselves contain "=" are returned whole.
#
# Returns a lambda that takes a node list (every node must respond to
# attr(:href)) and returns an Array of unique String values.
def values_for_param(param)
  name = param.to_s

  lambda { |nodeList|
    values = nodeList.collect { |node|
      query = URI::parse(node.attr(:href)).query
      # No query string at all: nothing to collect for this anchor.
      next unless query

      pair = query.split("&").
        collect { |token| token.split("=", 2) }.
        detect { |key, _value| key == name }

      # pair is nil when the parameter is absent; pair[1] is nil when the
      # parameter was given without "=". Both cases are dropped below.
      pair && pair[1]
    }

    values.compact.uniq
  }
end
|
15
17
|
end
|
16
|
-
end
|
17
18
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
19
|
+
module URIAddition
|
20
|
+
#
|
21
|
+
# Public
|
22
|
+
#
|
23
|
+
# Generates a hash representation of a uri's query string.
|
24
|
+
#
|
25
|
+
# Returns a hash mapping the URL query parameters to their respective values
|
26
|
+
#
|
27
|
+
# NOTE: this is intended as a decorator method for instances of URI::HTTP.
|
28
|
+
#
|
29
|
+
# examples:
|
30
|
+
#
|
31
|
+
# URI::parse(url).extend(URIAddition).query_hash
|
32
|
+
#
|
32
33
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
34
|
+
# Parses the receiver's query string into a Hash keyed by Symbol.
#
# The receiver must respond to #query (this module is meant to be mixed
# into URI::HTTP instances).
#
# Empty segments (such as the gap in "a=1&&b=2") and segments with an
# empty name are skipped rather than raising. Each pair is split at its
# first "=" only, so values that contain "=" are kept intact. When a
# parameter occurs more than once, the last occurrence wins.
#
# Returns nil when there is no query string; otherwise a Hash mapping
# parameter names (Symbols) to String values, or to nil when the
# parameter has no value.
def query_hash
  return unless self.query

  self.query.split("&").each_with_object({}) do |item, memo|
    param, value = item.split("=", 2)
    next if param.nil? || param.empty?

    # Keep the previous contract: both "a" and "a=" map to nil.
    value = nil if value && value.empty?
    memo[param.to_sym] = value
  end
end
|
39
41
|
end
|
40
|
-
end
|
41
42
|
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
module DeepFetchable
|
44
|
+
# Follows +path+ through nested containers, starting from the receiver.
#
# path - A single key or an Array of keys/indexes. Each one is applied in
#        order with #[] to the value reached so far.
#
# Returns the value found at the end of the path. The walk stops early and
# returns the falsy value (nil or false) as soon as one step yields it.
# An empty path returns the receiver itself.
def get_in(path)
  node = self

  Array(path).each do |key|
    node = node[key]
    # Stop before calling #[] on nil/false at the next step.
    break unless node
  end

  node
end
|
53
55
|
end
|
54
|
-
end
|
55
56
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
57
|
+
module Support
|
58
|
+
# Returns a new Hash with the same values as +hash+ and its keys converted
# to Symbols. The argument is not modified.
#
# hash - The Hash whose keys should be symbolized.
#
# Keys that do not respond to #to_sym are copied unchanged. The previous
# implementation copied every key verbatim and so never symbolized
# anything.
def symbolize_keys(hash)
  hash.each_with_object({}) do |(key, value), memo|
    symbol_key = key.respond_to?(:to_sym) ? key.to_sym : key
    memo[symbol_key] = value
  end
end
|
61
|
+
#
|
62
|
+
# Creates instance variables from a hash.
|
63
|
+
#
|
64
|
+
# hash - A hash of the instance variables to be created (name => value).
|
65
|
+
# defaults - A hash representing the attributes' default values (optional).
|
66
|
+
#
|
67
|
+
protected
|
68
|
+
# Sets one instance variable on the receiver per entry of +hash+, then
# fills in +defaults+ for every attribute that is still nil.
#
# hash     - Hash of attribute name => value; each becomes "@name".
# defaults - Hash of attribute name => fallback value (optional).
#
# A default is applied only when the attribute is unset or nil, so an
# explicitly supplied +false+ is no longer replaced by its default.
#
# Returns the +defaults+ hash (the result of iterating over it).
def set_attributes(hash, defaults={})
  hash.each { |key, value| instance_variable_set("@#{key}", value) }

  defaults.each do |key, value|
    ivar = "@#{key}"
    # nil check rather than truthiness: false is a legitimate value.
    instance_variable_set(ivar, value) if instance_variable_get(ivar).nil?
  end
end
|
73
75
|
end
|
74
76
|
end
|
77
|
+
|
75
78
|
end
|
data/spec/helpers/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: extraloop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-01-01 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &10243900 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 1.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *10243900
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: nokogiri
|
27
|
-
requirement: &
|
27
|
+
requirement: &10242520 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 1.5.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *10242520
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: typhoeus
|
38
|
-
requirement: &
|
38
|
+
requirement: &10240780 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.3.2
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *10240780
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: logging
|
49
|
-
requirement: &
|
49
|
+
requirement: &10238820 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: 0.6.1
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *10238820
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &10233640 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ~>
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: 2.7.1
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *10233640
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rr
|
71
|
-
requirement: &
|
71
|
+
requirement: &10231680 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ~>
|
@@ -76,7 +76,18 @@ dependencies:
|
|
76
76
|
version: 1.0.4
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *10231680
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: pry
|
82
|
+
requirement: &10229180 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ~>
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 0.9.7.4
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *10229180
|
80
91
|
description: A Ruby library for extracting data from websites and web based APIs.
|
81
92
|
Supports most common document formats (i.e. HTML, XML, and JSON), and comes with
|
82
93
|
a handy mechanism for iterating over paginated datasets.
|