scruber 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +21 -17
- data/lib/scruber/core/crawler.rb +4 -4
- data/lib/scruber/core/extensions/csv_output.rb +1 -1
- data/lib/scruber/helpers/dictionary_reader/csv.rb +1 -1
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +4 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8deee66960a3768ace0af72a5cb1eced62c90329
|
4
|
+
data.tar.gz: a0d3f330d8b838aee078f2d752226a1e5432b311
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30df32ccd86afde913d47483e9f327b94869c52a21f7c1a43f442ef8a1f138a1500d0746d4641a4542937aa2dbba7e53e4697c95b234cfdc3d07eeb8ab3d13ed
|
7
|
+
data.tar.gz: 4e57023647a62f7f312a8a77b89097920e7ab6750c1fd6562d1b6b6b4b3ff239b484f08aed6920a259365100ba4efe9d3f168b1a873183a377c877261d49ba15
|
data/README.md
CHANGED
@@ -1,38 +1,42 @@
|
|
1
1
|
# Scruber
|
2
2
|
|
3
|
-
|
3
|
+
Scruber is an open source scraping framework for Ruby.
|
4
4
|
|
5
|
-
|
5
|
+
## Getting started
|
6
6
|
|
7
|
-
|
7
|
+
1. Install Scruber at the command prompt if you haven't yet:
|
8
8
|
|
9
|
-
|
9
|
+
$ gem install scruber
|
10
10
|
|
11
|
-
|
12
|
-
gem 'scruber'
|
13
|
-
```
|
11
|
+
2. Create a new workspace
|
14
12
|
|
15
|
-
|
13
|
+
$ scruber new myworkspace
|
16
14
|
|
17
|
-
|
15
|
+
3. Create a new scraper
|
18
16
|
|
19
|
-
|
17
|
+
$ scruber new scraper example
|
20
18
|
|
21
|
-
$ gem install scruber
|
22
19
|
|
23
|
-
|
20
|
+
```ruby
|
21
|
+
Scruber.run do
|
22
|
+
csv_file 'output.csv', col_sep: ','
|
23
|
+
|
24
|
+
get 'http://example.com'
|
24
25
|
|
25
|
-
|
26
|
+
parse :html do |page, html|
|
27
|
+
csv_out html.at('title').text
|
28
|
+
end
|
29
|
+
end
|
30
|
+
```
|
26
31
|
|
27
|
-
|
32
|
+
4. Run your scraper
|
28
33
|
|
29
|
-
|
34
|
+
$ scruber start example
|
30
35
|
|
31
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
36
|
|
33
37
|
## Contributing
|
34
38
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
39
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber.
|
36
40
|
|
37
41
|
## License
|
38
42
|
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Scruber
|
|
17
17
|
Scruber.configuration.merge_options(options)
|
18
18
|
@callbacks_options = {}
|
19
19
|
@callbacks = {}
|
20
|
-
@on_complete_callbacks =
|
20
|
+
@on_complete_callbacks = []
|
21
21
|
@queue = Scruber::Queue.new(scraper_name: scraper_name)
|
22
22
|
@fetcher = Scruber::Fetcher.new
|
23
23
|
load_extenstions
|
@@ -39,7 +39,7 @@ module Scruber
|
|
39
39
|
end
|
40
40
|
end
|
41
41
|
end
|
42
|
-
@on_complete_callbacks.each do |_,callback|
|
42
|
+
@on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
|
43
43
|
instance_exec &(callback)
|
44
44
|
end
|
45
45
|
end
|
@@ -84,8 +84,8 @@ module Scruber
|
|
84
84
|
@callbacks[page_type.to_sym] = block
|
85
85
|
end
|
86
86
|
|
87
|
-
def
|
88
|
-
@on_complete_callbacks[
|
87
|
+
def on_complete(priority=1, &block)
|
88
|
+
@on_complete_callbacks.push [priority,block]
|
89
89
|
end
|
90
90
|
|
91
91
|
def process_page(page, page_type)
|
data/lib/scruber/core/extensions/csv_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Scruber
|
|
7
7
|
file_id = options.fetch(:file_id) { :default }.to_sym
|
8
8
|
options.delete(:file_id)
|
9
9
|
Scruber::Core::Extensions::CsvOutput.register_csv file_id, path, options
|
10
|
-
|
10
|
+
on_complete -1 do
|
11
11
|
Scruber::Core::Extensions::CsvOutput.close_all
|
12
12
|
end
|
13
13
|
end
|
data/lib/scruber/version.rb
CHANGED
data/lib/scruber.rb
CHANGED
@@ -26,6 +26,10 @@ require "scruber/core/extensions/csv_output"
|
|
26
26
|
require "scruber/core/extensions/queue_aliases"
|
27
27
|
require "scruber/core/extensions/parser_aliases"
|
28
28
|
|
29
|
+
require "scruber/helpers/dictionary_reader"
|
30
|
+
require "scruber/helpers/dictionary_reader/xml"
|
31
|
+
require "scruber/helpers/dictionary_reader/csv"
|
32
|
+
|
29
33
|
# require "scruber/core/configuration"
|
30
34
|
# require "scruber/core/configuration"
|
31
35
|
|
@@ -44,11 +48,6 @@ module Scruber
|
|
44
48
|
autoload :AbstractAdapter, "scruber/helpers/fetcher_agent_adapters/abstract_adapter"
|
45
49
|
autoload :Memory, "scruber/helpers/fetcher_agent_adapters/memory"
|
46
50
|
end
|
47
|
-
autoload :DictionaryReader, "scruber/helpers/dictionary_reader"
|
48
|
-
module DictionaryReader
|
49
|
-
autoload :Xml, "scruber/helpers/dictionary_reader/xml"
|
50
|
-
autoload :Csv, "scruber/helpers/dictionary_reader/csv"
|
51
|
-
end
|
52
51
|
end
|
53
52
|
|
54
53
|
class << self
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-04-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|