apify_core 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +16 -0
- data/Gemfile.lock +79 -0
- data/LICENSE.txt +22 -0
- data/README.md +43 -0
- data/Rakefile +3 -0
- data/apify_core.gemspec +34 -0
- data/bin/bundler +16 -0
- data/bin/coderay +16 -0
- data/bin/htmldiff +16 -0
- data/bin/ldiff +16 -0
- data/bin/nokogiri +16 -0
- data/bin/pry +16 -0
- data/bin/rackup +16 -0
- data/bin/rake +16 -0
- data/bin/rspec +16 -0
- data/bin/server +16 -0
- data/bin/tilt +16 -0
- data/lib/apify_core/fetcher.rb +190 -0
- data/lib/apify_core/filter.rb +83 -0
- data/lib/apify_core/parser.rb +68 -0
- data/lib/apify_core/version.rb +5 -0
- data/lib/apify_core.rb +19 -0
- data/spec/complex_spec.rb +736 -0
- data/spec/examples/apify_request.json +62 -0
- data/spec/examples/apify_response.json +1399 -0
- data/spec/examples/github_blog_request.json +24 -0
- data/spec/examples/oblomoff_events_request.json +21 -0
- data/spec/examples/vgorode_dn_events_request.json +23 -0
- data/spec/examples/vgorode_dp_events_request.json +23 -0
- data/spec/examples/vgorode_kh_events_request.json +23 -0
- data/spec/examples/vgorode_kiev_events_request.json +23 -0
- data/spec/examples/vgorode_lg_events_request.json +23 -0
- data/spec/examples/vgorode_lviv_events_request.json +23 -0
- data/spec/examples/vgorode_od_events_request.json +23 -0
- data/spec/examples/vgorode_zp_events_request.json +23 -0
- data/spec/spec_helper.rb +8 -0
- metadata +247 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 39b891973e8c30065d8136d0b61f4fea6c4c8328
|
4
|
+
data.tar.gz: e2ce4293075a8bb218660a0e0546e86e51559b90
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fdd15b69af5f5068a7f2fadc8afd2f746c074c757fb979569a65eeff525cca225472f21814b1869a20c147f5df3935ad50d40ab61f6a14d82009f50009077c9b
|
7
|
+
data.tar.gz: 57899e4ff289ead9ced7b376b345d4fd9f0b4b1f9b0a53221237724e2510c96bf225e6e44e1694317396bcee20caec8773c5e0fe7b044c02ddfb496e02635fb0
|
data/Gemfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in apify_core.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
gem 'activesupport', require: 'active_support/all'
|
7
|
+
gem 'watir-webdriver', '~> 0.6.11'
|
8
|
+
gem 'rest_client', '~> 1.8.2'
|
9
|
+
gem 'headless', '~> 1.0.2'
|
10
|
+
gem 'parallel', '~> 1.3.3'
|
11
|
+
gem 'nokogiri', '~> 1.6.5'
|
12
|
+
|
13
|
+
group :development do
|
14
|
+
gem 'awesome_print', '~> 1.2.0'
|
15
|
+
gem 'pry'
|
16
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
apify_core (0.0.1)
|
5
|
+
activesupport
|
6
|
+
headless
|
7
|
+
json
|
8
|
+
nokogiri (~> 1.6.5)
|
9
|
+
parallel
|
10
|
+
rest_client
|
11
|
+
watir-webdriver
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: https://rubygems.org/
|
15
|
+
specs:
|
16
|
+
activesupport (4.2.0)
|
17
|
+
i18n (~> 0.7)
|
18
|
+
json (~> 1.7, >= 1.7.7)
|
19
|
+
minitest (~> 5.1)
|
20
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
21
|
+
tzinfo (~> 1.1)
|
22
|
+
awesome_print (1.2.0)
|
23
|
+
childprocess (0.5.5)
|
24
|
+
ffi (~> 1.0, >= 1.0.11)
|
25
|
+
diff-lcs (1.2.5)
|
26
|
+
ffi (1.9.6)
|
27
|
+
headless (1.0.2)
|
28
|
+
i18n (0.7.0)
|
29
|
+
json (1.8.1)
|
30
|
+
mini_portile (0.6.2)
|
31
|
+
minitest (5.5.0)
|
32
|
+
multi_json (1.10.1)
|
33
|
+
netrc (0.7.9)
|
34
|
+
nokogiri (1.6.5)
|
35
|
+
mini_portile (~> 0.6.0)
|
36
|
+
parallel (1.3.3)
|
37
|
+
rake (10.4.2)
|
38
|
+
rest_client (1.8.2)
|
39
|
+
netrc (~> 0.7.7)
|
40
|
+
rspec (3.0.0)
|
41
|
+
rspec-core (~> 3.0.0)
|
42
|
+
rspec-expectations (~> 3.0.0)
|
43
|
+
rspec-mocks (~> 3.0.0)
|
44
|
+
rspec-core (3.0.4)
|
45
|
+
rspec-support (~> 3.0.0)
|
46
|
+
rspec-expectations (3.0.4)
|
47
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
48
|
+
rspec-support (~> 3.0.0)
|
49
|
+
rspec-mocks (3.0.4)
|
50
|
+
rspec-support (~> 3.0.0)
|
51
|
+
rspec-support (3.0.4)
|
52
|
+
rubyzip (1.1.6)
|
53
|
+
selenium-webdriver (2.44.0)
|
54
|
+
childprocess (~> 0.5)
|
55
|
+
multi_json (~> 1.0)
|
56
|
+
rubyzip (~> 1.0)
|
57
|
+
websocket (~> 1.0)
|
58
|
+
thread_safe (0.3.4)
|
59
|
+
tzinfo (1.2.2)
|
60
|
+
thread_safe (~> 0.1)
|
61
|
+
watir-webdriver (0.6.11)
|
62
|
+
selenium-webdriver (>= 2.18.0)
|
63
|
+
websocket (1.2.1)
|
64
|
+
|
65
|
+
PLATFORMS
|
66
|
+
ruby
|
67
|
+
|
68
|
+
DEPENDENCIES
|
69
|
+
activesupport
|
70
|
+
apify_core!
|
71
|
+
awesome_print (~> 1.2.0)
|
72
|
+
bundler (~> 1.7)
|
73
|
+
headless (~> 1.0.2)
|
74
|
+
nokogiri (~> 1.6.5)
|
75
|
+
parallel (~> 1.3.3)
|
76
|
+
rake (~> 10.0)
|
77
|
+
rest_client (~> 1.8.2)
|
78
|
+
rspec (~> 3.0.0)
|
79
|
+
watir-webdriver (~> 0.6.11)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 victorvsk
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Apify Core
|
2
|
+
|
3
|
+
Apify Core is a part of the Apify Project. It parses HTML/XML into JSON with an easy API and useful filters.
|
4
|
+
The Apify Project allows even more: parsing an entire website with ease.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'apify_core'
|
12
|
+
```
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install apify_core
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
```
|
25
|
+
html = RestClient.get('http://github.com')
|
26
|
+
pattern = { title: '<% title %>' }
|
27
|
+
title = Apify::Core::Parser.new(html, pattern).perform # GitHub · Build software better, together.
|
28
|
+
```
|
29
|
+
|
30
|
+
```
|
31
|
+
request = { github: { url: ['http://github.com'], js: false, host: 'http://github.com', pattern: { title: '<% title %>' } } }
|
32
|
+
response = Apify.crawl!(request) # { "github": { "title": "GitHub · Build software better, together." } }
|
33
|
+
```
|
34
|
+
|
35
|
+
See more in documentation (TODO). Also some syntax examples can be found in spec/examples.
|
36
|
+
|
37
|
+
## Contributing
|
38
|
+
|
39
|
+
1. Fork it ( https://github.com/victorvsk/apify-core/fork )
|
40
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
41
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
42
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
43
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/apify_core.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'apify_core/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "apify_core"
|
8
|
+
spec.version = Apify::Core::VERSION
|
9
|
+
spec.authors = ["victorvsk"]
|
10
|
+
spec.email = ["victor@vyskrebentsev.ru"]
|
11
|
+
spec.summary = %q{Core part of Apify project. An easy way to parse HTML\XML content and crawl websites in a normalized and centralized way.}
|
12
|
+
spec.description = %q{Simple API to transform from simple HTML to JSON to entire website to JSON.}
|
13
|
+
spec.homepage = ""
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0")
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.7"
|
22
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
23
|
+
spec.add_development_dependency "rspec", "~> 3.0.0"
|
24
|
+
|
25
|
+
|
26
|
+
spec.add_dependency 'watir-webdriver'
|
27
|
+
spec.add_dependency 'rest_client'
|
28
|
+
spec.add_dependency 'headless'
|
29
|
+
spec.add_dependency 'parallel'
|
30
|
+
spec.add_dependency 'nokogiri', '~> 1.6.5'
|
31
|
+
spec.add_dependency 'json'
|
32
|
+
spec.add_dependency 'activesupport'
|
33
|
+
|
34
|
+
end
|
data/bin/bundler
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'bundler' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('bundler', 'bundler')
|
data/bin/coderay
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'coderay' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('coderay', 'coderay')
|
data/bin/htmldiff
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'htmldiff' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('diff-lcs', 'htmldiff')
|
data/bin/ldiff
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'ldiff' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('diff-lcs', 'ldiff')
|
data/bin/nokogiri
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'nokogiri' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('nokogiri', 'nokogiri')
|
data/bin/pry
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'pry' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('pry', 'pry')
|
data/bin/rackup
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rackup' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rack', 'rackup')
|
data/bin/rake
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rake' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rake', 'rake')
|
data/bin/rspec
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rspec' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('rspec-core', 'rspec')
|
data/bin/server
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'server' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('apify', 'server')
|
data/bin/tilt
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'tilt' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'pathname'
|
10
|
+
ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile",
|
11
|
+
Pathname.new(__FILE__).realpath)
|
12
|
+
|
13
|
+
require 'rubygems'
|
14
|
+
require 'bundler/setup'
|
15
|
+
|
16
|
+
load Gem.bin_path('tilt', 'tilt')
|
@@ -0,0 +1,190 @@
|
|
1
|
+
module Apify
  module Core
    # Downloads the pages described by a request hash and, for every source
    # that declares a pattern, runs the downloaded HTML through Parser.
    #
    # Usage: build with the request hash, call #prepare to expand URLs and
    # pagination, then #perform (or #to_json) to download and parse.
    #
    # After #prepare each source key is exposed as a reader on the class
    # (e.g. fetcher.github) holding { pages:, js:, pattern:, host: }; after
    # #perform the same reader holds the parsed (or raw) pages instead.
    class Fetcher
      # Captures the quoted argument of select('..') / from("..") / filter('..').
      PARENTHESIS_ARGS = /\([\"\'](.*?)[\"\']\)/

      # Matches a pager placeholder such as "<% 1,5,1 %>" (first,last,step).
      PAGER_PATTERN = /<%\s?+(\d+,\d+,\d+)\s?+%>/

      attr_accessor :sources
      attr_accessor :result

      # @param pages [Hash] source name => options (:url, :host, :js,
      #   :pattern, :paginate, :from)
      # @param processes [Integer] worker processes for plain HTTP downloads
      # @param delay [Numeric] seconds to wait before every download
      def initialize(pages, processes = 2, delay = 0)
        @pages = pages
        @processes = processes
        @delay = delay
      end

      # Normalizes every source into { pages:, js:, pattern:, host: } and
      # stores it in an instance variable named after the source. Sources with
      # a :from expression are stored untouched; their URLs are resolved later
      # in #perform, once the referenced source has been downloaded.
      #
      # @return [Array] the source names
      def prepare
        @pages.each do |key, value|
          value[:url] = Filter.apply(value[:url], ['map_urlencode'])
          self.class.send(:attr_reader, key.to_sym)

          if value[:from]
            instance_variable_set("@#{key}", value)
            next
          end

          url = self.class.base_url_for(value[:url], value[:host])
          pages =
            if value[:paginate]
              self.class.paginate(url: url, to_replace: value[:paginate][0], pagination: value[:paginate][1])
            else
              [url.respond_to?(:each) ? url : url.to_s]
            end

          prepared = { pages: pages, js: value[:js] || false, pattern: value[:pattern], host: value[:host] }
          instance_variable_set("@#{key}", prepared)
        end
        self.sources = @pages.keys
      end

      # Downloads every source (in declaration order) and parses the ones that
      # have a pattern.
      #
      # @return [Hash] source name => parsed pages, only for sources with a pattern
      # @raise [ArgumentError] when a :from expression is malformed or points
      #   at an unknown source
      def perform
        sources.each do |source|
          resolve_from_expression(source) if @pages[source].key?(:from)
          src = source_data(source)
          src[:pages] = download_pages(src[:pages], src[:js] ? :js : :normal)
        end

        parsed = {}
        sources.each do |source|
          src = source_data(source)
          pattern = src[:pattern]
          pages = src[:pages]
          pages = pages.map { |html| Parser.new(html, pattern).perform } if pattern

          # Sources without a pattern are only intermediate steps (e.g. link
          # lists used by a :from expression) and are left out of the result.
          parsed[source] = pages if pattern
          instance_variable_set("@#{source}", pages)
        end

        @result = parsed
        @json = parsed
      end

      # @return [String] JSON of the (cached) #perform result
      def to_json
        (@json || perform).to_json
      end

      class << self
        # Expands a URL (or list of URLs) into one URL per page.
        #
        #   Apify::Core::Fetcher.paginate(url: 'http://site.com',
        #                                 to_replace: '(\/?)$',
        #                                 pagination: '?page=<% 1,5,1 %>')
        #
        # @param opts [Hash] :url (required), :to_replace (regexp source that
        #   is replaced by the pager), :pagination (pager template containing
        #   one "<% first,last,step %>" placeholder)
        # @return [Array] the generated URLs, or [url] when the template has no
        #   placeholder
        # @raise [ArgumentError] when :url is missing, more than one
        #   placeholder is given, or the step is zero
        def paginate(opts = {})
          pagination = opts[:pagination] || '?page=<% 1,5,1 %>'
          to_replace = opts[:to_replace] || '(\/?)\Z'
          url_or_array_of_urls = opts[:url]
          raise ArgumentError, "URL parameter missing" if url_or_array_of_urls.nil?

          placeholders = pagination.scan(PAGER_PATTERN)
          return [url_or_array_of_urls] if placeholders.empty?
          raise ArgumentError, "Only one pagination pattern allowed." if placeholders.count > 1

          first_page, last_page, step = placeholders.first.first.split(',').map { |n| n.strip.to_i }
          raise ArgumentError, "Pagination step must be greater than zero." if step.zero?

          replace_regexp = Regexp.new(to_replace) # loop-invariant, build once
          urls = url_or_array_of_urls.respond_to?(:each) ? url_or_array_of_urls : [url_or_array_of_urls]

          result = []
          (first_page..last_page).step(step).each do |page|
            to_append = pagination.gsub(PAGER_PATTERN, page.to_s)
            urls.each do |url|
              result << url.to_s.chomp('/').gsub(replace_regexp, to_append)
            end
          end
          result
        end

        # Downloads one page.
        #
        # @param url [String, URI]
        # @param method [Symbol] :js (headless browser) or :normal (plain HTTP)
        # @param delay [Numeric] seconds to sleep before the request
        # @return [String, nil] the HTML; nil when the HTTP request failed or
        #   the method is unknown
        def download(url, method = :normal, delay = 0)
          # The delay is passed in explicitly: the instance's @delay is not
          # visible from this class-level method.
          sleep delay if delay.to_f > 0

          case method
          when :js then download_with_browser(url)
          when :normal then download_with_http(url)
          end
        end

        # Makes relative URLs absolute by prefixing base_url.
        #
        # @param url_or_array [String, URI, #each]
        # @param base_url [String, nil]
        # @return [Array<String>] for a collection; for a single URL the URI
        #   itself when it already has a host, otherwise the prefixed String
        # @raise [ArgumentError] when a URL has no host and base_url is nil
        def base_url_for(url_or_array, base_url)
          if url_or_array.respond_to?(:each)
            result = []
            url_or_array.each { |url| result << absolute_url(url, base_url).to_s }
            result
          else
            absolute_url(url_or_array, base_url)
          end
        end

        private

        # Returns the URI when it carries a host, else base_url + url as String.
        def absolute_url(url, base_url)
          uri = URI(url)
          return uri if uri.host
          raise ArgumentError, "No host provided." if base_url.nil?

          "#{base_url}#{uri}"
        end

        # Renders the page in a browser inside a virtual display so that
        # JavaScript-generated markup is included.
        def download_with_browser(url)
          headless = Headless.new
          headless.start
          browser = nil
          begin
            browser = Watir::Browser.new
            browser.goto url
            html = browser.html
          ensure
            # Always release the browser process and the virtual display,
            # even when navigation fails.
            browser.close if browser
            headless.destroy
          end
          print "+"
          html
        end

        # Fetches the page over HTTP and re-encodes it to UTF-8 when the
        # document declares another charset. Prints "+" on success and "-"
        # (returning nil) on the listed request errors.
        def download_with_http(url)
          html = RestClient.get(url,
                                'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                                'Accept-Language' => 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4',
                                'Connection' => 'keep-alive',
                                'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36')
          html = to_utf8(html, declared_charset(html))
          print "+"
          html
        # NOTE(review): the trailing bare `RestClient` is kept from the
        # original list; it looks truncated (perhaps RestClient::Exception) -
        # confirm against the upstream source.
        rescue RestClient::RequestTimeout, RestClient::ResourceNotFound, RestClient::InternalServerError, URI::InvalidURIError, RestClient::Forbidden, RestClient::BadGateway, RestClient
          print "-"
          nil
        end

        # Reads the charset from the last <meta http-equiv="content-type"> tag.
        def declared_charset(html)
          meta = Nokogiri::HTML(html).search('meta[@http-equiv="content-type"]').last
          content = meta && meta['content']
          match = content && content.match(/charset=(.+)/)
          match && match[1].downcase
        end

        # Best effort: when the declared charset is unknown or cannot be
        # converted, the document is returned as downloaded.
        def to_utf8(html, charset)
          return html if charset.nil? || charset.strip.empty? || charset == 'utf-8'

          html.force_encoding(charset).encode("utf-8", undef: :replace, invalid: :replace)
        rescue ArgumentError, EncodingError
          html
        end
      end

      private

      # Data stored for a source by #prepare / #perform.
      def source_data(source)
        instance_variable_get("@#{source}")
      end

      # Downloads every entry of +pages+ (a URL or a nested list of URLs) and
      # returns the flat list of results. Browser downloads run in a single
      # process.
      def download_pages(pages, method)
        processes_number = (method == :js ? 1 : @processes)
        delay = @delay
        downloaded = ::Parallel.map(pages, in_processes: processes_number) do |url_or_array|
          if url_or_array.respond_to?(:each)
            res = []
            url_or_array.each { |url| res << self.class.download(url, method, delay) }
            res
          else
            self.class.download(url_or_array, method, delay)
          end
        end
        downloaded.flatten
      end

      # Evaluates a :from expression such as
      #   select('a.link') from('index') filter('mapattr_href | map_urlencode')
      # against the already downloaded pages of the referenced source and
      # stores the resulting absolute URLs as this source's pages.
      def resolve_from_expression(source)
        expression = @pages[source][:from]
        select = expression_argument(expression, 'select')
        from = expression_argument(expression, 'from')
        unless select && from
          raise ArgumentError, "Malformed :from expression for #{source}: #{expression.inspect}"
        end

        filters = (expression_argument(expression, 'filter') || '').split('|').map(&:strip)
        filters = ['mapattr_href', 'map_urlencode'] if filters.empty?

        # Look the origin up by name among the declared sources instead of
        # dispatching an arbitrary, request-supplied method name.
        unless sources.map(&:to_s).include?(from)
          raise ArgumentError, "Unknown source #{from.inspect} referenced by #{source}"
        end

        urls = Filter.apply(Parser.fetch(select, source_data(from)[:pages]), filters)
        @pages[source][:pages] = self.class.base_url_for(urls, @pages[source][:host])
      end

      # Returns the quoted argument of keyword(...) in the expression, or nil.
      def expression_argument(expression, keyword)
        match = expression.match(/#{keyword}#{PARENTHESIS_ARGS}/)
        match && match[1]
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Apify
  module Core
    # Named transformations ("filters") that are chained over a parsed node,
    # a node list or a string, e.g. Filter.apply(nodes, ['first', 'text', 'strip']).
    #
    # Besides the fixed filters listed in FILTERS, two dynamic families exist:
    #   attr_<name>    - reads attribute <name> from a single node
    #   mapattr_<name> - reads attribute <name> from every node of a list
    module Filter
      # Names that may be dispatched from a filter chain.
      FILTERS = %w[
        first text strip list html inner_html
        map_text map_html map_inner_html map_urlencode urlencode
      ].freeze

      # Prefix of the dynamic attribute filters.
      ATTRIBUTE_FILTER = /\A(?:map)?attr_/

      class << self
        # Applies the filters left to right, stopping early when a filter
        # yields nil.
        #
        # @param node_or_str [Object, nil] value to transform
        # @param filters [Array<String, Symbol>] filter names; not modified
        # @return [Object, nil] the transformed value
        # @raise [NoMethodError] for a name that is not a known filter
        def apply(node_or_str, filters = [])
          Array(filters).reduce(node_or_str) do |value, name|
            break value if value.nil?

            send(filter_method(name), value)
          end
        end

        private

        # Filter names come from request patterns and are applied to scraped
        # content, so only known filters may be dispatched - never arbitrary
        # Kernel methods such as eval or system.
        def filter_method(name)
          name = name.to_s
          return name if FILTERS.include?(name) || name =~ ATTRIBUTE_FILTER

          raise NoMethodError, "undefined filter '#{name}' for #{self}"
        end

        def first(node)
          node.first
        end

        def text(node)
          node.text
        end

        def strip(str)
          str.strip if str
        end

        # Identity filter: keeps the node list as it is.
        def list(node)
          node
        end

        def html(node)
          node.to_s
        end

        def inner_html(node)
          node.inner_html.to_s
        end

        def map_text(node)
          node.map(&:text).map(&:strip)
        end

        def map_html(node)
          node.map(&:to_s)
        end

        def map_inner_html(node)
          node.map(&:inner_html).map(&:to_s)
        end

        # Encodes every URL of a list; a single URL is encoded on its own.
        def map_urlencode(node)
          return urlencode(node) unless node.respond_to?(:map)

          node.map { |url| urlencode(url) }
        end

        # @return [URI] when the URL is already valid, otherwise the
        #   percent-escaped String
        def urlencode(url)
          URI(url)
        rescue URI::InvalidURIError
          # URI.encode was removed in Ruby 3.0; the parser's escape is the
          # same implementation.
          URI::DEFAULT_PARSER.escape(url)
        end

        # Implements the attr_<name> / mapattr_<name> filter families.
        def method_missing(method_sym, *arguments, &block)
          name = method_sym.to_s
          if name.start_with?('mapattr_')
            # Strip only the prefix: the attribute name itself may contain "attr_".
            attribute = name.sub(/\Amapattr_/, '')
            arguments.first.map { |n| n[attribute] }
          elsif name.start_with?('attr_')
            attribute = name.sub(/\Aattr_/, '')
            arguments.first[attribute]
          else
            super
          end
        end

        def respond_to_missing?(method_sym, include_private = false)
          method_sym.to_s =~ ATTRIBUTE_FILTER ? true : super
        end
      end
    end
  end
end
|