scrubyt 0.3.0 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +13 -6
- data/Rakefile +22 -10
- data/lib/scrubyt.rb +9 -4
- data/lib/scrubyt/core/navigation/fetch_action.rb +14 -11
- data/lib/scrubyt/core/navigation/navigation_actions.rb +47 -72
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +3 -2
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +12 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +31 -8
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +29 -11
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +38 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +5 -5
- data/lib/scrubyt/core/scraping/pattern.rb +25 -18
- data/lib/scrubyt/core/shared/extractor.rb +109 -128
- data/lib/scrubyt/logging.rb +146 -8
- data/lib/scrubyt/output/export.rb +60 -44
- data/lib/scrubyt/output/result_node.rb +34 -3
- data/lib/scrubyt/output/scrubyt_result.rb +18 -9
- data/lib/scrubyt/utils/compound_example_lookup.rb +1 -1
- data/lib/scrubyt/utils/shared_utils.rb +1 -1
- data/lib/scrubyt/utils/simple_example_lookup.rb +9 -5
- metadata +52 -6
- data/lib/scrubyt/core/shared/evaluation_context.rb +0 -57
- data/lib/scrubyt/core/shared/u_r_i_builder.rb +0 -67
metadata
CHANGED
@@ -3,9 +3,9 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrubyt
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.3.0
|
7
|
-
date: 2007-
|
8
|
-
summary: A powerful Web-scraping framework
|
6
|
+
version: 0.3.4
|
7
|
+
date: 2007-09-26 00:00:00 +02:00
|
8
|
+
summary: A powerful Web-scraping framework built on Mechanize and Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: peter@rubyrailways.com
|
@@ -15,7 +15,7 @@ description: scRUBYt! is an easy to learn and use, yet powerful and effective we
|
|
15
15
|
autorequire:
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
18
|
-
has_rdoc:
|
18
|
+
has_rdoc: false
|
19
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ">"
|
@@ -61,9 +61,10 @@ files:
|
|
61
61
|
- lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
|
62
62
|
- lib/scrubyt/core/scraping/filters/detail_page_filter.rb
|
63
63
|
- lib/scrubyt/core/scraping/filters/download_filter.rb
|
64
|
-
- lib/scrubyt/core/
|
64
|
+
- lib/scrubyt/core/scraping/filters/text_filter.rb
|
65
|
+
- lib/scrubyt/core/scraping/filters/constant_filter.rb
|
66
|
+
- lib/scrubyt/core/scraping/filters/script_filter.rb
|
65
67
|
- lib/scrubyt/core/shared/extractor.rb
|
66
|
-
- lib/scrubyt/core/shared/evaluation_context.rb
|
67
68
|
test_files: []
|
68
69
|
|
69
70
|
rdoc_options: []
|
@@ -95,3 +96,48 @@ dependencies:
|
|
95
96
|
- !ruby/object:Gem::Version
|
96
97
|
version: 0.6.3
|
97
98
|
version:
|
99
|
+
- !ruby/object:Gem::Dependency
|
100
|
+
name: ParseTreeReloaded
|
101
|
+
version_requirement:
|
102
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
103
|
+
requirements:
|
104
|
+
- - ">"
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: 0.0.0
|
107
|
+
version:
|
108
|
+
- !ruby/object:Gem::Dependency
|
109
|
+
name: RubyInlineAcceleration
|
110
|
+
version_requirement:
|
111
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">"
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 0.0.0
|
116
|
+
version:
|
117
|
+
- !ruby/object:Gem::Dependency
|
118
|
+
name: RubyInline
|
119
|
+
version_requirement:
|
120
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: 3.6.3
|
125
|
+
version:
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: ParseTree
|
128
|
+
version_requirement:
|
129
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
130
|
+
requirements:
|
131
|
+
- - "="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: 1.7.1
|
134
|
+
version:
|
135
|
+
- !ruby/object:Gem::Dependency
|
136
|
+
name: ruby2ruby
|
137
|
+
version_requirement:
|
138
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
139
|
+
requirements:
|
140
|
+
- - "="
|
141
|
+
- !ruby/object:Gem::Version
|
142
|
+
version: 1.1.6
|
143
|
+
version:
|
@@ -1,57 +0,0 @@
|
|
1
|
-
module Scrubyt
|
2
|
-
##
|
3
|
-
#=<tt>Holding the evaluation context of the extraction process</tt>
|
4
|
-
#
|
5
|
-
#Every kind of data that is shared among patterns during the extraction process
|
6
|
-
#is held in this class, so it can be looked up anytime.
|
7
|
-
#
|
8
|
-
#This class provides also some high-level basic functionality in navigation, like
|
9
|
-
#crawling to new pages, attaching document to the root pattern once arrived at the
|
10
|
-
#desired page etc.
|
11
|
-
#
|
12
|
-
#It can be viewed as a glue between Extractor and NavigationActions as well - these
|
13
|
-
#two classes need to communicate frequently as well as share different information
|
14
|
-
#and this is accomplished through EvaluationContext.
|
15
|
-
class EvaluationContext
|
16
|
-
attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
|
17
|
-
|
18
|
-
def initialize
|
19
|
-
@root_pattern = nil
|
20
|
-
@next_page = nil
|
21
|
-
@document_index = 0
|
22
|
-
@extractor = nil
|
23
|
-
@evaluating_extractor_definition = false
|
24
|
-
end
|
25
|
-
|
26
|
-
##
|
27
|
-
#Crawl to a new page. This function should not be called from the outside - it is automatically called
|
28
|
-
#if the next_page pattern is defined
|
29
|
-
def crawl_to_new_page(uri_builder)
|
30
|
-
#puts "Crawling to new page!"
|
31
|
-
#puts "example #{uri_builder.next_page_example}"
|
32
|
-
temp_document = uri_builder.next_page_example ?
|
33
|
-
generate_next_page_link(uri_builder) :
|
34
|
-
uri_builder.generate_next_uri
|
35
|
-
return false if temp_document == nil
|
36
|
-
FetchAction.restore_host_name
|
37
|
-
@extractor.fetch(temp_document)
|
38
|
-
return true
|
39
|
-
end
|
40
|
-
|
41
|
-
def generate_next_page_link(uri_builder)
|
42
|
-
return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
|
43
|
-
xpath = uri_builder.next_page_pattern.filters[0].xpath
|
44
|
-
node = (@extractor.get_hpricot_doc/xpath).map.last
|
45
|
-
node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
|
46
|
-
return nil if node == nil || node.attributes['href'] == nil
|
47
|
-
node.attributes['href'].gsub('&amp;') {'&'}
|
48
|
-
end
|
49
|
-
|
50
|
-
def setup_uri_builder(pattern,args)
|
51
|
-
if args[0] =~ /^http.+/
|
52
|
-
args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
|
53
|
-
end
|
54
|
-
@uri_builder = URIBuilder.new(pattern,args)
|
55
|
-
end
|
56
|
-
end #end of class EvaluationContext
|
57
|
-
end #end of module Scrubyt
|
@@ -1,67 +0,0 @@
|
|
1
|
-
module Scrubyt
|
2
|
-
##
|
3
|
-
#=<tt>Build URIs from different parameters</tt>
|
4
|
-
#
|
5
|
-
#When crawling to further pages which are machine-generated
|
6
|
-
#(most typically "next" pages) we need to detect the pattern
|
7
|
-
#and generate the next URI based on the detected rule. This
|
8
|
-
#class provides methods to build URIs based on different criteria.
|
9
|
-
#
|
10
|
-
#The other possibility is to use constant objects ('Next' links,
|
11
|
-
#or image links (like right arrow) pointing to the next page).
|
12
|
-
#URIBuilder supports both possibilities.
|
13
|
-
class URIBuilder
|
14
|
-
attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
|
15
|
-
|
16
|
-
def initialize(pattern,args)
|
17
|
-
if args[0] =~ /^http.+/
|
18
|
-
#Figure out how are the URLs generated based on the next URL
|
19
|
-
get_next_param(string_diff(args[0], args[1]))
|
20
|
-
@increment = 0
|
21
|
-
@current_uri = args[1]
|
22
|
-
@limit = args[2][:limit] if args.size > 2
|
23
|
-
else
|
24
|
-
#Otherwise, do this in the 'classic' way (by clicking on the "next" link)
|
25
|
-
@next_page_pattern = pattern
|
26
|
-
@next_page_example = args[0]
|
27
|
-
@limit = args[1][:limit] if args.size > 1
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
#Used when generating the next URI (as opposed to 'clicking' the next link)
|
32
|
-
def generate_next_uri
|
33
|
-
@increment += @next_increment
|
34
|
-
return @current_uri if @increment == @next_increment
|
35
|
-
@next_increment = 1 if @next_increment == 2
|
36
|
-
if @current_uri !~ /#{@next_param}/
|
37
|
-
@current_uri += (@next_param + '=' + @next_increment.to_s)
|
38
|
-
else
|
39
|
-
@current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
|
40
|
-
"#{@next_param}=#{@increment}"
|
41
|
-
end
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
private
|
46
|
-
def get_next_param(pair)
|
47
|
-
param_and_value = pair.split('=')
|
48
|
-
@next_param = param_and_value[0]
|
49
|
-
@next_increment = param_and_value[1].to_i
|
50
|
-
end
|
51
|
-
|
52
|
-
def find_difference_index(s1,s2)
|
53
|
-
cmp = s2.scan(/./).zip(s1.scan(/./))
|
54
|
-
i = 0
|
55
|
-
loop do
|
56
|
-
return i if cmp[i][0] != cmp[i][1]
|
57
|
-
i+=1
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def string_diff(s1,s2)
|
62
|
-
s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
|
63
|
-
end #end of method string_diff
|
64
|
-
end #end of class URIBuilder
|
65
|
-
end #end of module Scrubyt
|
66
|
-
|
67
|
-
|