scruber 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -17
- data/lib/scruber/core/crawler.rb +4 -4
- data/lib/scruber/core/extensions/csv_output.rb +1 -1
- data/lib/scruber/helpers/dictionary_reader/csv.rb +1 -1
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +4 -5
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8deee66960a3768ace0af72a5cb1eced62c90329
|
4
|
+
data.tar.gz: a0d3f330d8b838aee078f2d752226a1e5432b311
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 30df32ccd86afde913d47483e9f327b94869c52a21f7c1a43f442ef8a1f138a1500d0746d4641a4542937aa2dbba7e53e4697c95b234cfdc3d07eeb8ab3d13ed
|
7
|
+
data.tar.gz: 4e57023647a62f7f312a8a77b89097920e7ab6750c1fd6562d1b6b6b4b3ff239b484f08aed6920a259365100ba4efe9d3f168b1a873183a377c877261d49ba15
|
data/README.md
CHANGED
@@ -1,38 +1,42 @@
|
|
1
1
|
# Scruber
|
2
2
|
|
3
|
-
|
3
|
+
Scruber is an open source scraping framework for Ruby.
|
4
4
|
|
5
|
-
|
5
|
+
## Getting started
|
6
6
|
|
7
|
-
|
7
|
+
1. Install Scruber at the command prompt if you haven't yet:
|
8
8
|
|
9
|
-
|
9
|
+
$ gem install scruber
|
10
10
|
|
11
|
-
|
12
|
-
gem 'scruber'
|
13
|
-
```
|
11
|
+
2. Create a new workspace
|
14
12
|
|
15
|
-
|
13
|
+
$ scruber new myworkspace
|
16
14
|
|
17
|
-
|
15
|
+
3. Create a new scraper
|
18
16
|
|
19
|
-
|
17
|
+
$ scruber new scraper example
|
20
18
|
|
21
|
-
$ gem install scruber
|
22
19
|
|
23
|
-
|
20
|
+
```ruby
|
21
|
+
Scruber.run do
|
22
|
+
csv_file 'output.csv', col_sep: ','
|
23
|
+
|
24
|
+
get 'http://example.com'
|
24
25
|
|
25
|
-
|
26
|
+
parse :html do |page, html|
|
27
|
+
csv_out html.at('title').text
|
28
|
+
end
|
29
|
+
end
|
30
|
+
```
|
26
31
|
|
27
|
-
|
32
|
+
4. Run your scraper
|
28
33
|
|
29
|
-
|
34
|
+
$ scruber start example
|
30
35
|
|
31
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
32
36
|
|
33
37
|
## Contributing
|
34
38
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
39
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/scruber/scruber.
|
36
40
|
|
37
41
|
## License
|
38
42
|
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -17,7 +17,7 @@ module Scruber
|
|
17
17
|
Scruber.configuration.merge_options(options)
|
18
18
|
@callbacks_options = {}
|
19
19
|
@callbacks = {}
|
20
|
-
@on_complete_callbacks =
|
20
|
+
@on_complete_callbacks = []
|
21
21
|
@queue = Scruber::Queue.new(scraper_name: scraper_name)
|
22
22
|
@fetcher = Scruber::Fetcher.new
|
23
23
|
load_extenstions
|
@@ -39,7 +39,7 @@ module Scruber
|
|
39
39
|
end
|
40
40
|
end
|
41
41
|
end
|
42
|
-
@on_complete_callbacks.each do |_,callback|
|
42
|
+
@on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
|
43
43
|
instance_exec &(callback)
|
44
44
|
end
|
45
45
|
end
|
@@ -84,8 +84,8 @@ module Scruber
|
|
84
84
|
@callbacks[page_type.to_sym] = block
|
85
85
|
end
|
86
86
|
|
87
|
-
def
|
88
|
-
@on_complete_callbacks[
|
87
|
+
def on_complete(priority=1, &block)
|
88
|
+
@on_complete_callbacks.push [priority,block]
|
89
89
|
end
|
90
90
|
|
91
91
|
def process_page(page, page_type)
|
data/lib/scruber/core/extensions/csv_output.rb
CHANGED
@@ -7,7 +7,7 @@ module Scruber
|
|
7
7
|
file_id = options.fetch(:file_id) { :default }.to_sym
|
8
8
|
options.delete(:file_id)
|
9
9
|
Scruber::Core::Extensions::CsvOutput.register_csv file_id, path, options
|
10
|
-
|
10
|
+
on_complete -1 do
|
11
11
|
Scruber::Core::Extensions::CsvOutput.close_all
|
12
12
|
end
|
13
13
|
end
|
data/lib/scruber/version.rb
CHANGED
data/lib/scruber.rb
CHANGED
@@ -26,6 +26,10 @@ require "scruber/core/extensions/csv_output"
|
|
26
26
|
require "scruber/core/extensions/queue_aliases"
|
27
27
|
require "scruber/core/extensions/parser_aliases"
|
28
28
|
|
29
|
+
require "scruber/helpers/dictionary_reader"
|
30
|
+
require "scruber/helpers/dictionary_reader/xml"
|
31
|
+
require "scruber/helpers/dictionary_reader/csv"
|
32
|
+
|
29
33
|
# require "scruber/core/configuration"
|
30
34
|
# require "scruber/core/configuration"
|
31
35
|
|
@@ -44,11 +48,6 @@ module Scruber
|
|
44
48
|
autoload :AbstractAdapter, "scruber/helpers/fetcher_agent_adapters/abstract_adapter"
|
45
49
|
autoload :Memory, "scruber/helpers/fetcher_agent_adapters/memory"
|
46
50
|
end
|
47
|
-
autoload :DictionaryReader, "scruber/helpers/dictionary_reader"
|
48
|
-
module DictionaryReader
|
49
|
-
autoload :Xml, "scruber/helpers/dictionary_reader/xml"
|
50
|
-
autoload :Csv, "scruber/helpers/dictionary_reader/csv"
|
51
|
-
end
|
52
51
|
end
|
53
52
|
|
54
53
|
class << self
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-04-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|