apify_core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +79 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +3 -0
- data/apify_core.gemspec +34 -0
- data/bin/bundler +16 -0
- data/bin/coderay +16 -0
- data/bin/htmldiff +16 -0
- data/bin/ldiff +16 -0
- data/bin/nokogiri +16 -0
- data/bin/pry +16 -0
- data/bin/rackup +16 -0
- data/bin/rake +16 -0
- data/bin/rspec +16 -0
- data/bin/server +16 -0
- data/bin/tilt +16 -0
- data/lib/apify_core/fetcher.rb +190 -0
- data/lib/apify_core/filter.rb +83 -0
- data/lib/apify_core/parser.rb +68 -0
- data/lib/apify_core/version.rb +5 -0
- data/lib/apify_core.rb +19 -0
- data/spec/complex_spec.rb +736 -0
- data/spec/examples/apify_request.json +62 -0
- data/spec/examples/apify_response.json +1399 -0
- data/spec/examples/github_blog_request.json +24 -0
- data/spec/examples/oblomoff_events_request.json +21 -0
- data/spec/examples/vgorode_dn_events_request.json +23 -0
- data/spec/examples/vgorode_dp_events_request.json +23 -0
- data/spec/examples/vgorode_kh_events_request.json +23 -0
- data/spec/examples/vgorode_kiev_events_request.json +23 -0
- data/spec/examples/vgorode_lg_events_request.json +23 -0
- data/spec/examples/vgorode_lviv_events_request.json +23 -0
- data/spec/examples/vgorode_od_events_request.json +23 -0
- data/spec/examples/vgorode_zp_events_request.json +23 -0
- data/spec/spec_helper.rb +8 -0
- metadata +247 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 39b891973e8c30065d8136d0b61f4fea6c4c8328
|
4
|
+
data.tar.gz: e2ce4293075a8bb218660a0e0546e86e51559b90
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fdd15b69af5f5068a7f2fadc8afd2f746c074c757fb979569a65eeff525cca225472f21814b1869a20c147f5df3935ad50d40ab61f6a14d82009f50009077c9b
|
7
|
+
data.tar.gz: 57899e4ff289ead9ced7b376b345d4fd9f0b4b1f9b0a53221237724e2510c96bf225e6e44e1694317396bcee20caec8773c5e0fe7b044c02ddfb496e02635fb0
|
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in apify_core.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'activesupport', require: 'active_support/all'
|
7
|
+
gem 'watir-webdriver', '~> 0.6.11'
|
8
|
+
gem 'rest_client', '~> 1.8.2'
|
9
|
+
gem 'headless', '~> 1.0.2'
|
10
|
+
gem 'parallel', '~> 1.3.3'
|
11
|
+
gem 'nokogiri', '~> 1.6.5'
|
12
|
+
|
13
|
+
group :development do
|
14
|
+
gem 'awesome_print', '~> 1.2.0'
|
15
|
+
gem 'pry'
|
16
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
apify_core (0.0.1)
|
5
|
+
activesupport
|
6
|
+
headless
|
7
|
+
json
|
8
|
+
nokogiri (~> 1.6.5)
|
9
|
+
parallel
|
10
|
+
rest_client
|
11
|
+
watir-webdriver
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: https://rubygems.org/
|
15
|
+
specs:
|
16
|
+
activesupport (4.2.0)
|
17
|
+
i18n (~> 0.7)
|
18
|
+
json (~> 1.7, >= 1.7.7)
|
19
|
+
minitest (~> 5.1)
|
20
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
21
|
+
tzinfo (~> 1.1)
|
22
|
+
awesome_print (1.2.0)
|
23
|
+
childprocess (0.5.5)
|
24
|
+
ffi (~> 1.0, >= 1.0.11)
|
25
|
+
diff-lcs (1.2.5)
|
26
|
+
ffi (1.9.6)
|
27
|
+
headless (1.0.2)
|
28
|
+
i18n (0.7.0)
|
29
|
+
json (1.8.1)
|
30
|
+
mini_portile (0.6.2)
|
31
|
+
minitest (5.5.0)
|
32
|
+
multi_json (1.10.1)
|
33
|
+
netrc (0.7.9)
|
34
|
+
nokogiri (1.6.5)
|
35
|
+
mini_portile (~> 0.6.0)
|
36
|
+
parallel (1.3.3)
|
37
|
+
rake (10.4.2)
|
38
|
+
rest_client (1.8.2)
|
39
|
+
netrc (~> 0.7.7)
|
40
|
+
rspec (3.0.0)
|
41
|
+
rspec-core (~> 3.0.0)
|
42
|
+
rspec-expectations (~> 3.0.0)
|
43
|
+
rspec-mocks (~> 3.0.0)
|
44
|
+
rspec-core (3.0.4)
|
45
|
+
rspec-support (~> 3.0.0)
|
46
|
+
rspec-expectations (3.0.4)
|
47
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
48
|
+
rspec-support (~> 3.0.0)
|
49
|
+
rspec-mocks (3.0.4)
|
50
|
+
rspec-support (~> 3.0.0)
|
51
|
+
rspec-support (3.0.4)
|
52
|
+
rubyzip (1.1.6)
|
53
|
+
selenium-webdriver (2.44.0)
|
54
|
+
childprocess (~> 0.5)
|
55
|
+
multi_json (~> 1.0)
|
56
|
+
rubyzip (~> 1.0)
|
57
|
+
websocket (~> 1.0)
|
58
|
+
thread_safe (0.3.4)
|
59
|
+
tzinfo (1.2.2)
|
60
|
+
thread_safe (~> 0.1)
|
61
|
+
watir-webdriver (0.6.11)
|
62
|
+
selenium-webdriver (>= 2.18.0)
|
63
|
+
websocket (1.2.1)
|
64
|
+
|
65
|
+
PLATFORMS
|
66
|
+
ruby
|
67
|
+
|
68
|
+
DEPENDENCIES
|
69
|
+
activesupport
|
70
|
+
apify_core!
|
71
|
+
awesome_print (~> 1.2.0)
|
72
|
+
bundler (~> 1.7)
|
73
|
+
headless (~> 1.0.2)
|
74
|
+
nokogiri (~> 1.6.5)
|
75
|
+
parallel (~> 1.3.3)
|
76
|
+
rake (~> 10.0)
|
77
|
+
rest_client (~> 1.8.2)
|
78
|
+
rspec (~> 3.0.0)
|
79
|
+
watir-webdriver (~> 0.6.11)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 victorvsk
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Apify Core
|
2
|
+
|
3
|
+
Apify Core is a part of the Apify Project. Parse HTML/XML to JSON with an easy API and useful filters.
|
4
|
+
The Apify Project allows even more: parsing an entire website with ease.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'apify_core'
|
12
|
+
```
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install apify_core
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
```
|
25
|
+
html = RestClient.get('http://github.com')
|
26
|
+
pattern = { title: '<% title %>' }
|
27
|
+
title = Apify::Core.new(html, pattern).perform # GitHub · Build software better, together.
|
28
|
+
```
|
29
|
+
|
30
|
+
```
|
31
|
+
request = { github: { url: ['http://github.com'], js: false, host: 'http://github.com', pattern: { title: '<% title %>' } } }
|
32
|
+
response = Apify.crawl!(request) # { "github": { "title": "GitHub · Build software better, together." } }
|
33
|
+
```
|
34
|
+
|
35
|
+
See more in documentation (TODO). Also some syntax examples can be found in spec/examples.
|
36
|
+
|
37
|
+
## Contributing
|
38
|
+
|
39
|
+
1. Fork it ( https://github.com/victorvsk/apify-core/fork )
|
40
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
41
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
42
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
43
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/apify_core.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for apify_core. The version is read from
# lib/apify_core/version.rb and the packaged file list from git.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'apify_core/version'

Gem::Specification.new do |spec|
  spec.name          = "apify_core"
  spec.version       = Apify::Core::VERSION
  spec.authors       = ["victorvsk"]
  spec.email         = ["victor@vyskrebentsev.ru"]
  spec.summary       = %q{Core part of Apify project. An easy way to parse HTML\XML content and crawl websites in a normalized and centralized way.}
  spec.description   = %q{Simple API to transform from simple HTML to JSON to entire website to JSON.}
  # Repository URL as given in the README's "Contributing" section.
  spec.homepage      = "https://github.com/victorvsk/apify-core"
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  # bin/ only contains Bundler-generated binstubs for development tools
  # (rake, rspec, bundler, pry, ...). Publishing them as executables would
  # shadow the real commands on every machine that installs this gem, so
  # executables are taken from exe/ instead (currently none).
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0.0"

  spec.add_dependency 'watir-webdriver'
  spec.add_dependency 'rest_client'
  spec.add_dependency 'headless'
  spec.add_dependency 'parallel'
  spec.add_dependency 'nokogiri', '~> 1.6.5'
  spec.add_dependency 'json'
  spec.add_dependency 'activesupport'
end
|
data/bin/bundler
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'bundler' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('bundler', 'bundler')
|
data/bin/coderay
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'coderay' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('coderay', 'coderay')
|
data/bin/htmldiff
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'htmldiff' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('diff-lcs', 'htmldiff')
|
data/bin/ldiff
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'ldiff' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('diff-lcs', 'ldiff')
|
data/bin/nokogiri
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'nokogiri' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('nokogiri', 'nokogiri')
|
data/bin/pry
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'pry' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('pry', 'pry')
|
data/bin/rackup
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rackup' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rack', 'rackup')
|
data/bin/rake
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rake' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rake', 'rake')
|
data/bin/rspec
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rspec' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rspec-core', 'rspec')
|
data/bin/server
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'server' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('apify', 'server')
|
data/bin/tilt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'tilt' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('tilt', 'tilt')
|
@@ -0,0 +1,190 @@
|
|
1
|
+
module Apify
  module Core
    # Resolves the page URLs of every source in a request hash, downloads
    # them (plain HTTP or through a headless browser) and, for sources that
    # carry a :pattern, runs each downloaded page through Parser.
    #
    # Typical use: Fetcher.new(pages).tap(&:prepare).perform
    class Fetcher
      attr_accessor :sources
      attr_accessor :result

      # Matches one quoted argument in parentheses, e.g. ("div a") or ('x').
      PARENTHESIS_ARGS = /\([\"\'](.*?)[\"\']\)/

      # Filters used by a :from expression that declares no filter(...) part.
      DEFAULT_FROM_FILTERS = %w[mapattr_href map_urlencode].freeze

      # @param pages [Hash] source name => options (:url, :host, :js,
      #   :pattern, :paginate, :from)
      # @param processes [Integer] worker processes for non-JS downloads
      # @param delay [Numeric] seconds to wait before each download
      def initialize(pages, processes = 2, delay = 0)
        @pages = pages
        @processes = processes
        @delay = delay
      end

      # Builds the per-source state (@<source> instance variable plus a
      # reader of the same name) and records the source names.
      #
      # @return [Array] the source names
      def prepare
        @pages.each do |key, value|
          value[:url] = Filter.apply(value[:url], ['map_urlencode'])
          # The reader lives on this instance only; defining it on the class
          # would expose every source name on all other Fetcher instances.
          define_singleton_method(key.to_sym) { instance_variable_get("@#{key}") }

          if value[:from]
            # Pages of a :from source are resolved later, in #perform, once
            # the source it depends on has been downloaded.
            instance_variable_set("@#{key}", value)
            next
          end

          url = self.class.base_url_for(value[:url], value[:host])
          pages =
            if value[:paginate]
              self.class.paginate(url: url, to_replace: value[:paginate][0], pagination: value[:paginate][1])
            else
              [url.respond_to?(:each) ? url : url.to_s]
            end

          state = { pages: pages, js: value[:js] || false, pattern: value[:pattern], host: value[:host] }
          instance_variable_set("@#{key}", state)
        end
        self.sources = @pages.keys
      end

      # Downloads every source and parses the ones that have a pattern.
      # Calls #prepare first when it has not been called yet.
      #
      # @return [Hash] source name => parsed pages (only sources with a pattern)
      # @raise [ArgumentError] on a malformed :from expression or one that
      #   names an unknown source
      def perform
        prepare if sources.nil?

        sources.each do |source|
          resolve_linked_pages(source) if @pages[source][:from]

          src = source_data(source)
          fetch_method = src[:js] ? :js : :normal
          # A browser session is driven from a single process.
          workers = (fetch_method == :js ? 1 : @processes)
          src[:pages] = ::Parallel.map(src[:pages], in_processes: workers) do |url_or_array|
            if url_or_array.respond_to?(:each)
              htmls = []
              url_or_array.each { |url| htmls << self.class.download(url, fetch_method, @delay) }
              htmls
            else
              self.class.download(url_or_array, fetch_method, @delay)
            end
          end.flatten
        end

        parsed = {}
        sources.each do |source|
          src = source_data(source)
          elem =
            if src[:pattern]
              src[:pages].map { |html| Parser.new(html, src[:pattern]).perform }
            else
              src[:pages]
            end

          parsed[source] = elem if src[:pattern]
          instance_variable_set("@#{source}", elem)
        end

        @result = parsed
        @json = parsed
      end

      # @return [String] JSON of the result; runs #perform on first use
      def to_json
        (@json || perform).to_json
      end

      class << self
        # Expands one pagination placeholder "<% first,last,step %>" into a
        # list of URLs.
        #
        #   Fetcher.paginate(url: 'http://site.com', to_replace: '(\/?)$', pagination: '?page=<% 1,5,1 %>')
        #
        # @param opts [Hash] :url (String or list, required), :to_replace
        #   (regexp source replaced in each URL), :pagination (template)
        # @return [Array] generated URLs, or [url] when the template holds
        #   no placeholder
        # @raise [ArgumentError] when :url is missing or the template holds
        #   more than one placeholder
        def paginate(opts = {})
          pagination = opts[:pagination] || '?page=<% 1,5,1 %>'
          to_replace = opts[:to_replace] || '(\/?)\Z'
          url_or_array_of_urls = opts[:url]
          raise ArgumentError, "URL parameter missing" if url_or_array_of_urls.nil?

          regexp = /<%\s?+(\d+,\d+,\d+)\s?+%>/
          placeholders = pagination.scan(regexp)
          return [opts[:url]] if placeholders.count == 0
          raise ArgumentError, "Only one pagination pattern allowed." if placeholders.count > 1

          first, last, step = placeholders.first.first.split(',').map(&:strip).map(&:to_i)
          # Compiled once; it does not change between pages.
          replace_regexp = Regexp.new(to_replace)
          urls = url_or_array_of_urls.respond_to?(:each) ? url_or_array_of_urls : [url_or_array_of_urls]

          result = []
          (first..last).step(step).each do |page|
            to_append = pagination.gsub(regexp, page.to_s)
            urls.each do |url|
              result << url.to_s.chomp('/').gsub(replace_regexp, to_append)
            end
          end
          result
        end

        # Downloads one URL and returns its HTML. Prints "+" on success and
        # "-" on a rescued HTTP failure.
        #
        # @param url [String]
        # @param method [Symbol] :normal (RestClient) or :js (headless browser)
        # @param delay [Numeric, nil] seconds to sleep first; when nil the
        #   class-level @delay is used, as before this parameter existed
        # @return [String, nil] HTML, or nil on a rescued failure or an
        #   unknown method
        def download(url, method = :normal, delay = nil)
          pause = delay.nil? ? @delay : delay
          sleep pause if pause.to_f > 0

          case method
          when :js then download_with_browser(url)
          when :normal then download_with_http(url)
          end
        end

        # Prefixes host-less URLs with base_url.
        #
        # @param url_or_array [String, URI, #each] one URL or a list
        # @param base_url [String, nil]
        # @return [Array<String>] for a list; for a single URL, the URI
        #   itself when it has a host, otherwise a String
        # @raise [ArgumentError] when a URL has no host and base_url is nil
        def base_url_for(url_or_array, base_url)
          if url_or_array.respond_to?(:each)
            result = []
            url_or_array.each { |url| result << absolutize(url, base_url).to_s }
            result
          else
            absolutize(url_or_array, base_url)
          end
        end

        private

        # Returns the URI when it has a host, else "#{base_url}#{url}".
        def absolutize(url, base_url)
          uri = URI(url)
          raise ArgumentError, "No host provided." if uri.host.nil? && base_url.nil?
          uri.host ? uri : "#{base_url}#{uri}"
        end

        # Renders the page in a browser on a headless display and returns
        # the resulting HTML. Browser and display are released even when
        # navigation fails.
        def download_with_browser(url)
          headless = Headless.new
          headless.start
          browser = Watir::Browser.new
          browser.goto url
          html = browser.html
          print "+"
          html
        ensure
          browser.close if browser
          headless.destroy if headless
        end

        # Plain HTTP GET with browser-like headers; the body is converted to
        # UTF-8 when the page declares another charset.
        def download_with_http(url)
          html = RestClient.get(url,
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language' => 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
            'Connection' => 'keep-alive',
            'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36')
          html = transcode_to_utf8(html)
          print "+"
          html
        # NOTE(review): the trailing bare `RestClient` module only matches
        # exceptions that include it - confirm which class was intended.
        rescue RestClient::RequestTimeout, RestClient::ResourceNotFound, RestClient::InternalServerError, URI::InvalidURIError, RestClient::Forbidden, RestClient::BadGateway, RestClient
          print "-"
          nil
        end

        # Re-encodes html to UTF-8 using the charset from its content-type
        # meta tag. Leaves html untouched when no charset is declared, it is
        # already utf-8, or Ruby does not know the encoding name.
        def transcode_to_utf8(html)
          charset = declared_charset(html)
          return html if charset.nil? || charset.empty? || charset == 'utf-8'
          html.force_encoding(charset).encode("utf-8", invalid: :replace, undef: :replace)
        rescue ArgumentError
          # force_encoding raises ArgumentError for an unknown encoding name.
          html
        end

        # Best-effort charset lookup; any failure yields nil.
        def declared_charset(html)
          meta = Nokogiri::HTML(html).search('meta[@http-equiv="content-type"]').last
          content = meta && meta['content']
          content && content[/charset=(.+)/, 1].to_s.strip.downcase
        rescue StandardError
          nil
        end
      end

      private

      # State stored for a source by #prepare / #perform.
      def source_data(name)
        instance_variable_get("@#{name}")
      end

      # Fills in the :pages of a :from source with links selected from the
      # already-downloaded pages of the source it refers to.
      def resolve_linked_pages(source)
        page = @pages[source]
        statement = parse_from_expression(page[:from])
        # Look the name up among the declared sources instead of calling an
        # arbitrary method named by request data.
        origin_key = sources.find { |name| name.to_s == statement[:from] }
        raise ArgumentError, "Unknown source in from expression: #{statement[:from]}" if origin_key.nil?

        nodes = Parser.fetch(statement[:select], source_data(origin_key)[:pages])
        urls = Filter.apply(nodes, statement[:filters])
        page[:pages] = self.class.base_url_for(urls, page[:host])
      end

      # Splits 'select("css") from("source") filter("a|b")' into its parts.
      def parse_from_expression(expression)
        text = expression.to_s
        select = text[/select#{PARENTHESIS_ARGS}/, 1]
        from = text[/from#{PARENTHESIS_ARGS}/, 1]
        if select.nil? || from.nil?
          raise ArgumentError, "Malformed from expression: #{expression.inspect}"
        end

        filters = text[/filter#{PARENTHESIS_ARGS}/, 1].to_s.split('|').map(&:strip).reject(&:empty?)
        # A fresh copy each time: the callee may consume the list.
        filters = DEFAULT_FROM_FILTERS.dup if filters.empty?
        { select: select, from: from, filters: filters }
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Apify
  module Core
    # Named transformations ("filters") applied in sequence to a parsed node,
    # a node list or a string. Besides the fixed filters listed in FILTERS,
    # attr_<name> reads attribute <name> of one node and mapattr_<name>
    # reads it from every node of a list.
    module Filter
      # Filters implemented as private methods below.
      FILTERS = %w[
        first text strip list html inner_html
        map_text map_html map_inner_html map_urlencode urlencode
      ].freeze

      # Prefixes of the dynamic attribute filters.
      MAPATTR_PREFIX = /\Amapattr_/
      ATTR_PREFIX = /\Aattr_/

      class << self
        # Applies each filter in order, feeding the result of one into the
        # next. Stops early and returns nil as soon as a value becomes nil.
        # The filters argument is left untouched.
        #
        # @param node_or_str [Object, nil] value to transform
        # @param filters [Array<String, Symbol>] filter names
        # @return [Object, nil] the transformed value
        # @raise [NoMethodError] when a name is not a known filter
        def apply(node_or_str, filters = [])
          Array(filters).reduce(node_or_str) do |value, filter|
            break value if value.nil?

            name = filter.to_s
            # Filter names come from request data; only dispatch to real
            # filters so that names such as "system" or "exit" never reach
            # Kernel through send.
            raise NoMethodError, "undefined filter `#{name}' for #{self}" unless filter?(name)
            send(name, value)
          end
        end

        private

        # True when name is a fixed filter or a dynamic attribute filter.
        def filter?(name)
          FILTERS.include?(name) || !(name =~ MAPATTR_PREFIX).nil? || !(name =~ ATTR_PREFIX).nil?
        end

        def first(node)
          node.first
        end

        def text(node)
          node.text
        end

        def strip(str)
          str.strip if str
        end

        # Identity filter: passes the node list through unchanged.
        def list(node)
          node
        end

        def html(node)
          node.to_s
        end

        def inner_html(node)
          node.inner_html.to_s
        end

        def map_text(node)
          node.map(&:text).map(&:strip)
        end

        def map_html(node)
          node.map(&:to_s)
        end

        def map_inner_html(node)
          node.map(&:inner_html).map(&:to_s)
        end

        def map_urlencode(node)
          node.map { |url| urlencode(url) }
        end

        # Returns a URI for an already valid URL, otherwise the
        # percent-escaped String form of it.
        def urlencode(url)
          URI(url)
        rescue URI::InvalidURIError
          # URI.encode was removed in Ruby 3.0; the default parser's escape
          # performs the same escaping.
          URI::DEFAULT_PARSER.escape(url)
        end

        # Implements the attr_<name> and mapattr_<name> filters.
        def method_missing(method_sym, *arguments, &block)
          name = method_sym.to_s
          if name =~ MAPATTR_PREFIX
            # Strip the leading prefix only, so attribute names that contain
            # "attr_" again are kept intact.
            attribute = name.sub(MAPATTR_PREFIX, '')
            arguments.first.map { |n| n[attribute] }
          elsif name =~ ATTR_PREFIX
            attribute = name.sub(ATTR_PREFIX, '')
            arguments.first[attribute]
          else
            super
          end
        end

        def respond_to_missing?(method_sym, include_private = false)
          name = method_sym.to_s
          !(name =~ MAPATTR_PREFIX).nil? || !(name =~ ATTR_PREFIX).nil? || super
        end
      end
    end
  end
end
|