scrubyt 0.3.0 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -3,9 +3,9 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2007-05-28 00:00:00 +02:00
8
- summary: A powerful Web-scraping framework
6
+ version: 0.3.4
7
+ date: 2007-09-26 00:00:00 +02:00
8
+ summary: A powerful Web-scraping framework built on Mechanize and Hpricot
9
9
  require_paths:
10
10
  - lib
11
11
  email: peter@rubyrailways.com
@@ -15,7 +15,7 @@ description: scRUBYt! is an easy to learn and use, yet powerful and effective we
15
15
  autorequire:
16
16
  default_executable:
17
17
  bindir: bin
18
- has_rdoc: "true"
18
+ has_rdoc: false
19
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
20
20
  requirements:
21
21
  - - ">"
@@ -61,9 +61,10 @@ files:
61
61
  - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
62
62
  - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
63
63
  - lib/scrubyt/core/scraping/filters/download_filter.rb
64
- - lib/scrubyt/core/shared/u_r_i_builder.rb
64
+ - lib/scrubyt/core/scraping/filters/text_filter.rb
65
+ - lib/scrubyt/core/scraping/filters/constant_filter.rb
66
+ - lib/scrubyt/core/scraping/filters/script_filter.rb
65
67
  - lib/scrubyt/core/shared/extractor.rb
66
- - lib/scrubyt/core/shared/evaluation_context.rb
67
68
  test_files: []
68
69
 
69
70
  rdoc_options: []
@@ -95,3 +96,48 @@ dependencies:
95
96
  - !ruby/object:Gem::Version
96
97
  version: 0.6.3
97
98
  version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: ParseTreeReloaded
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">"
105
+ - !ruby/object:Gem::Version
106
+ version: 0.0.0
107
+ version:
108
+ - !ruby/object:Gem::Dependency
109
+ name: RubyInlineAcceleration
110
+ version_requirement:
111
+ version_requirements: !ruby/object:Gem::Version::Requirement
112
+ requirements:
113
+ - - ">"
114
+ - !ruby/object:Gem::Version
115
+ version: 0.0.0
116
+ version:
117
+ - !ruby/object:Gem::Dependency
118
+ name: RubyInline
119
+ version_requirement:
120
+ version_requirements: !ruby/object:Gem::Version::Requirement
121
+ requirements:
122
+ - - "="
123
+ - !ruby/object:Gem::Version
124
+ version: 3.6.3
125
+ version:
126
+ - !ruby/object:Gem::Dependency
127
+ name: ParseTree
128
+ version_requirement:
129
+ version_requirements: !ruby/object:Gem::Version::Requirement
130
+ requirements:
131
+ - - "="
132
+ - !ruby/object:Gem::Version
133
+ version: 1.7.1
134
+ version:
135
+ - !ruby/object:Gem::Dependency
136
+ name: ruby2ruby
137
+ version_requirement:
138
+ version_requirements: !ruby/object:Gem::Version::Requirement
139
+ requirements:
140
+ - - "="
141
+ - !ruby/object:Gem::Version
142
+ version: 1.1.6
143
+ version:
@@ -1,57 +0,0 @@
1
- module Scrubyt
2
- ##
3
- #=<tt>Holding the evaluation context of the extraction process</tt>
4
- #
5
- #Every kind of data that is shared among patterns during the extraction process
6
- #is held in this class, so it can be looked up anytime.
7
- #
8
- #This class also provides some high-level basic functionality in navigation, like
9
- #crawling to new pages, attaching document to the root pattern once arrived at the
10
- #desired page etc.
11
- #
12
- #It can be viewed as a glue between Extractor and NavigationActions as well - these
13
- #two classes need to communicate frequently as well as share different information
14
- #and this is accomplished through EvaluationContext.
15
- class EvaluationContext
16
- attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
17
-
18
- def initialize
19
- @root_pattern = nil
20
- @next_page = nil
21
- @document_index = 0
22
- @extractor = nil
23
- @evaluating_extractor_definition = false
24
- end
25
-
26
- ##
27
- #Crawl to a new page. This function should not be called from the outside - it is automatically called
28
- #if the next_page pattern is defined
29
- def crawl_to_new_page(uri_builder)
30
- #puts "Crawling to new page!"
31
- #puts "example #{uri_builder.next_page_example}"
32
- temp_document = uri_builder.next_page_example ?
33
- generate_next_page_link(uri_builder) :
34
- uri_builder.generate_next_uri
35
- return false if temp_document == nil
36
- FetchAction.restore_host_name
37
- @extractor.fetch(temp_document)
38
- return true
39
- end
40
-
41
- def generate_next_page_link(uri_builder)
42
- return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
43
- xpath = uri_builder.next_page_pattern.filters[0].xpath
44
- node = (@extractor.get_hpricot_doc/xpath).map.last
45
- node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
46
- return nil if node == nil || node.attributes['href'] == nil
47
- node.attributes['href'].gsub('&amp;') {'&'}
48
- end
49
-
50
- def setup_uri_builder(pattern,args)
51
- if args[0] =~ /^http.+/
52
- args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
53
- end
54
- @uri_builder = URIBuilder.new(pattern,args)
55
- end
56
- end #end of class EvaluationContext
57
- end #end of module Scrubyt
@@ -1,67 +0,0 @@
1
- module Scrubyt
2
- ##
3
- #=<tt>Build URIs from different parameters</tt>
4
- #
5
- #When crawling to further pages which are machine-generated
6
- #(most typically "next" pages) we need to detect the pattern
7
- #and generate the next URI based on the detected rule. This
8
- #class provides methods to build URIs based on different criteria.
9
- #
10
- #The other possibility is to use constant objects ('Next' links,
11
- #or image links (like right arrow) pointing to the next page).
12
- #URIBuilder supports both possibilities.
13
- class URIBuilder
14
- attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
15
-
16
- def initialize(pattern,args)
17
- if args[0] =~ /^http.+/
18
- #Figure out how are the URLs generated based on the next URL
19
- get_next_param(string_diff(args[0], args[1]))
20
- @increment = 0
21
- @current_uri = args[1]
22
- @limit = args[2][:limit] if args.size > 2
23
- else
24
- #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
25
- @next_page_pattern = pattern
26
- @next_page_example = args[0]
27
- @limit = args[1][:limit] if args.size > 1
28
- end
29
- end
30
-
31
- #Used when generating the next URI (as opposed to 'clicking' the next link)
32
- def generate_next_uri
33
- @increment += @next_increment
34
- return @current_uri if @increment == @next_increment
35
- @next_increment = 1 if @next_increment == 2
36
- if @current_uri !~ /#{@next_param}/
37
- @current_uri += (@next_param + '=' + @next_increment.to_s)
38
- else
39
- @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
40
- "#{@next_param}=#{@increment}"
41
- end
42
- end
43
- end
44
-
45
- private
46
- def get_next_param(pair)
47
- param_and_value = pair.split('=')
48
- @next_param = param_and_value[0]
49
- @next_increment = param_and_value[1].to_i
50
- end
51
-
52
- def find_difference_index(s1,s2)
53
- cmp = s2.scan(/./).zip(s1.scan(/./))
54
- i = 0
55
- loop do
56
- return i if cmp[i][0] != cmp[i][1]
57
- i+=1
58
- end
59
- end
60
-
61
- def string_diff(s1,s2)
62
- s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
63
- end #end of method string_diff
64
- end #end of class URIBuilder
65
- end #end of module Scrubyt
66
-
67
-