scrubyt 0.3.0 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -3,9 +3,9 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrubyt
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.3.0
7
- date: 2007-05-28 00:00:00 +02:00
8
- summary: A powerful Web-scraping framework
6
+ version: 0.3.4
7
+ date: 2007-09-26 00:00:00 +02:00
8
+ summary: A powerful Web-scraping framework built on Mechanize and Hpricot
9
9
  require_paths:
10
10
  - lib
11
11
  email: peter@rubyrailways.com
@@ -15,7 +15,7 @@ description: scRUBYt! is an easy to learn and use, yet powerful and effective we
15
15
  autorequire:
16
16
  default_executable:
17
17
  bindir: bin
18
- has_rdoc: "true"
18
+ has_rdoc: false
19
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
20
20
  requirements:
21
21
  - - ">"
@@ -61,9 +61,10 @@ files:
61
61
  - lib/scrubyt/core/scraping/filters/html_subtree_filter.rb
62
62
  - lib/scrubyt/core/scraping/filters/detail_page_filter.rb
63
63
  - lib/scrubyt/core/scraping/filters/download_filter.rb
64
- - lib/scrubyt/core/shared/u_r_i_builder.rb
64
+ - lib/scrubyt/core/scraping/filters/text_filter.rb
65
+ - lib/scrubyt/core/scraping/filters/constant_filter.rb
66
+ - lib/scrubyt/core/scraping/filters/script_filter.rb
65
67
  - lib/scrubyt/core/shared/extractor.rb
66
- - lib/scrubyt/core/shared/evaluation_context.rb
67
68
  test_files: []
68
69
 
69
70
  rdoc_options: []
@@ -95,3 +96,48 @@ dependencies:
95
96
  - !ruby/object:Gem::Version
96
97
  version: 0.6.3
97
98
  version:
99
+ - !ruby/object:Gem::Dependency
100
+ name: ParseTreeReloaded
101
+ version_requirement:
102
+ version_requirements: !ruby/object:Gem::Version::Requirement
103
+ requirements:
104
+ - - ">"
105
+ - !ruby/object:Gem::Version
106
+ version: 0.0.0
107
+ version:
108
+ - !ruby/object:Gem::Dependency
109
+ name: RubyInlineAcceleration
110
+ version_requirement:
111
+ version_requirements: !ruby/object:Gem::Version::Requirement
112
+ requirements:
113
+ - - ">"
114
+ - !ruby/object:Gem::Version
115
+ version: 0.0.0
116
+ version:
117
+ - !ruby/object:Gem::Dependency
118
+ name: RubyInline
119
+ version_requirement:
120
+ version_requirements: !ruby/object:Gem::Version::Requirement
121
+ requirements:
122
+ - - "="
123
+ - !ruby/object:Gem::Version
124
+ version: 3.6.3
125
+ version:
126
+ - !ruby/object:Gem::Dependency
127
+ name: ParseTree
128
+ version_requirement:
129
+ version_requirements: !ruby/object:Gem::Version::Requirement
130
+ requirements:
131
+ - - "="
132
+ - !ruby/object:Gem::Version
133
+ version: 1.7.1
134
+ version:
135
+ - !ruby/object:Gem::Dependency
136
+ name: ruby2ruby
137
+ version_requirement:
138
+ version_requirements: !ruby/object:Gem::Version::Requirement
139
+ requirements:
140
+ - - "="
141
+ - !ruby/object:Gem::Version
142
+ version: 1.1.6
143
+ version:
@@ -1,57 +0,0 @@
1
- module Scrubyt
2
- ##
3
- #=<tt>Holding the evaluation context of the extraction process</tt>
4
- #
5
- #Every kind of data that is shared among patterns during the extraction process
6
- #is held in this class, so it can be looked up anytime.
7
- #
8
- #This class provides also some high-level basic functionality in navigation, like
9
- #crawling to new pages, attaching doucment to the root pattern once arrived at the
10
- #desired page etc.
11
- #
12
- #It can be viewed as a glue between Extractor and NavigationActions as well - these
13
- #two classes need to communicate frequently as well as share different information
14
- #and this is accomplished through EvaluationContext.
15
- class EvaluationContext
16
- attr_accessor :root_pattern, :document_index, :extractor, :uri_builder, :evaluating_extractor_definition
17
-
18
- def initialize
19
- @root_pattern = nil
20
- @next_page = nil
21
- @document_index = 0
22
- @extractor = nil
23
- @evaluating_extractor_definition = false
24
- end
25
-
26
- ##
27
- #Crawl to a new page. This function should not be called from the outside - it is automatically called
28
- #if the next_page pattern is defined
29
- def crawl_to_new_page(uri_builder)
30
- #puts "Crawling to new page!"
31
- #puts "example #{uri_builder.next_page_example}"
32
- temp_document = uri_builder.next_page_example ?
33
- generate_next_page_link(uri_builder) :
34
- uri_builder.generate_next_uri
35
- return false if temp_document == nil
36
- FetchAction.restore_host_name
37
- @extractor.fetch(temp_document)
38
- return true
39
- end
40
-
41
- def generate_next_page_link(uri_builder)
42
- return nil unless uri_builder.next_page_pattern.filters[0].generate_XPath_for_example(true)
43
- xpath = uri_builder.next_page_pattern.filters[0].xpath
44
- node = (@extractor.get_hpricot_doc/xpath).map.last
45
- node = XPathUtils.find_nearest_node_with_attribute(node, 'href')
46
- return nil if node == nil || node.attributes['href'] == nil
47
- node.attributes['href'].gsub('&amp;') {'&'}
48
- end
49
-
50
- def setup_uri_builder(pattern,args)
51
- if args[0] =~ /^http.+/
52
- args.insert(0, @extractor.get_current_doc_url) if args[1] !~ /^http.+/
53
- end
54
- @uri_builder = URIBuilder.new(pattern,args)
55
- end
56
- end #end of class EvaluationContext
57
- end #end of module Scrubyt
@@ -1,67 +0,0 @@
1
- module Scrubyt
2
- ##
3
- #=<tt>Build URIs from different parameters</tt>
4
- #
5
- #When crawling to further pages which are machine-generated
6
- #(most typically "next" pages) we need to detect the pattern
7
- #and generate the next URI based on the edetected rule. This
8
- #class provides methods to build URIs based on different criteria.
9
- #
10
- #The other possibility is to use constant objects ('Next' links,
11
- #or image links (like right arrow) pointing to the next page).
12
- #URIBUilder supports both possibilities.
13
- class URIBuilder
14
- attr_reader :next_page_example, :next_page_pattern, :limit, :next_param, :next_increment, :increment, :current_uri
15
-
16
- def initialize(pattern,args)
17
- if args[0] =~ /^http.+/
18
- #Figure out how are the URLs generated based on the next URL
19
- get_next_param(string_diff(args[0], args[1]))
20
- @increment = 0
21
- @current_uri = args[1]
22
- @limit = args[2][:limit] if args.size > 2
23
- else
24
- #Otherwise, do this in the 'classic' way (by clicking on the "next" link)
25
- @next_page_pattern = pattern
26
- @next_page_example = args[0]
27
- @limit = args[1][:limit] if args.size > 1
28
- end
29
- end
30
-
31
- #Used when generating the next URI (as opposed to 'clicking' the next link)
32
- def generate_next_uri
33
- @increment += @next_increment
34
- return @current_uri if @increment == @next_increment
35
- @next_increment = 1 if @next_increment == 2
36
- if @current_uri !~ /#{@next_param}/
37
- @current_uri += (@next_param + '=' + @next_increment.to_s)
38
- else
39
- @current_uri = @current_uri.sub(/#{@next_param}=#{@increment-@next_increment}/) do
40
- "#{@next_param}=#{@increment}"
41
- end
42
- end
43
- end
44
-
45
- private
46
- def get_next_param(pair)
47
- param_and_value = pair.split('=')
48
- @next_param = param_and_value[0]
49
- @next_increment = param_and_value[1].to_i
50
- end
51
-
52
- def find_difference_index(s1,s2)
53
- cmp = s2.scan(/./).zip(s1.scan(/./))
54
- i = 0
55
- loop do
56
- return i if cmp[i][0] != cmp[i][1]
57
- i+=1
58
- end
59
- end
60
-
61
- def string_diff(s1,s2)
62
- s2[find_difference_index(s1, s2)..s2.size-find_difference_index(s1.reverse, s2.reverse)-1]
63
- end #end of method string_diff
64
- end #end of class URIBuilder
65
- end #end of module Scrubyt
66
-
67
-