taiwan_tours 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +16 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +8 -0
- data/bin/taiwan_tours +16 -0
- data/lib/.DS_Store +0 -0
- data/lib/lonelyplanet_scrap_test.rb +52 -0
- data/lib/taiwan_tours/lonelyplanet_scrap.rb +52 -0
- data/lib/taiwan_tours/version.rb +5 -0
- data/lib/taiwan_tours.rb +2 -0
- data/spec/fixtures/vcr_cassettes/taiwan_tours.yml +2576 -0
- data/spec/lonelyplanet_spec.rb +38 -0
- data/spec/support/vcr_setup.rb +4 -0
- data/spec/tours.yml +116 -0
- data/taiwan_tours.gemspec +23 -0
- metadata +142 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2e64121574b9a2cbacc8da265b95fdd1921e9671
|
4
|
+
data.tar.gz: 3e585b8ba243438ee3b007ffd1d214c2d81f6a02
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 55193252cc08899017ac0b8bad0157986b3efc452dcb5dc2789b1522425c513723c3fbdc1fede4a1a8568d18b2f889dcc1a44e9d9febe8070dbba6efa1b072fc
|
7
|
+
data.tar.gz: f7109e3c3f637f9598dc1bc8dda6c80efdce811a2d48817ef8cae7aecd949734ef1efdc1ae39760e18d776d46afad287efd7409c8c345fe217f2af8d06a1e855
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- ruby-head
|
4
|
+
- "2.2.3"
|
5
|
+
- "2.1.0"
|
6
|
+
- "1.9.3"
|
7
|
+
- jruby-head
|
8
|
+
- jruby-19mode
|
9
|
+
matrix:
|
10
|
+
allow_failures:
|
11
|
+
- rvm: jruby-head
|
12
|
+
branches:
|
13
|
+
only:
|
14
|
+
- master
|
15
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
16
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2015 ZhongMeiZhou https://github.com/ZhongMeiZhou
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# taiwan_tours [![Build Status](https://travis-ci.org/ZhongMeiZhou/scraper_project.svg)](https://travis-ci.org/ZhongMeiZhou/scraper_project)
|
2
|
+
|
3
|
+
The lonelyplanet web scraper service retrieves details of Taiwan tour packages in the easiest way, with just a few simple steps.
|
4
|
+
|
5
|
+
Our company fully respects the privacy policies of our partner Lonelyplanet, as declared in its 'robots.txt' file.
|
6
|
+
|
7
|
+
|
8
|
+
## Gem Usage
|
9
|
+
|
10
|
+
Install our fantastic gem using the following simple command on your command line:
|
11
|
+
|
12
|
+
```sh
|
13
|
+
$ gem install taiwan_tours
|
14
|
+
```
|
15
|
+
|
16
|
+
Alternatively, as our gem is published on RubyGems.org, you can also include it in your 'Gemfile' as:
|
17
|
+
|
18
|
+
```ruby
|
19
|
+
gem 'taiwan_tours'
|
20
|
+
```
|
21
|
+
|
22
|
+
## Try it yourself!
|
23
|
+
Run it from the command line as follows:
|
24
|
+
|
25
|
+
```sh
|
26
|
+
$ taiwan_tours
|
27
|
+
```
|
28
|
+
|
29
|
+
or you can include it in your own Ruby code with the following steps:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
require 'taiwan_tours'
|
33
|
+
taiwan = LonelyPlanetScrape::LonelyPlanetTours.new
|
34
|
+
tours = taiwan.tours
|
35
|
+
```
|
36
|
+
|
37
|
+
## Want to make improvements?
|
38
|
+
|
39
|
+
1. Fork it ( https://github.com/ZhongMeiZhou/scraper_project )
|
40
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
41
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
42
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
43
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bin/taiwan_tours
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'json'
|
3
|
+
require './lib/taiwan_tours/lonelyplanet_scrap'
|
4
|
+
|
5
|
+
begin
|
6
|
+
taiwan_tours = LonelyPlanetScrape::LonelyPlanetTours.new
|
7
|
+
tour_arr = JSON.parse(taiwan_tours.tours)
|
8
|
+
puts 'According to LonelyPlanet, these are the best tour packages in Taiwan:'
|
9
|
+
|
10
|
+
tour_arr.each do |hash|
|
11
|
+
puts "- #{hash['title']} for $#{hash['price']}"
|
12
|
+
end
|
13
|
+
|
14
|
+
rescue => e
|
15
|
+
puts "Error occured - see details: #{e}"
|
16
|
+
end
|
data/lib/.DS_Store
ADDED
Binary file
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'oga'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'json'
|
4
|
+
require './lib/taiwan_tours/lonelyplanet_scrap'
|
5
|
+
# Test class practice for handling errors and checking for HTML structure changes
|
6
|
+
# can be used to test functionality before implementing it in Scraper Class
|
7
|
+
class LonelyPlanetToursTest < LonelyPlanetScrape::LonelyPlanetTours
|
8
|
+
# override super class initialize function
|
9
|
+
def initialize
|
10
|
+
test_parse_html
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_tour
|
14
|
+
@tours ||= test_html_extraction
|
15
|
+
end
|
16
|
+
|
17
|
+
private
|
18
|
+
|
19
|
+
# test connection to external site
|
20
|
+
def test_parse_html
|
21
|
+
parse_html
|
22
|
+
rescue OpenURI::HTTPError => e
|
23
|
+
puts "HTTP request error: #{e}"
|
24
|
+
end
|
25
|
+
|
26
|
+
# test scraping service and also test for changes in structure of elements being traversed
|
27
|
+
def test_html_extraction
|
28
|
+
result = []
|
29
|
+
fail 'OOPS, root article may have been changed or removed' if @document.xpath(TOUR_XPATH_CARD).text.empty?
|
30
|
+
fail 'Title h1 tag may have been changed or removed' if @document.xpath(CARD_TITLE_XPATH).text.empty?
|
31
|
+
fail 'Price span tag may have been changed or removed' if @document.xpath(CARD_PRICE_AMOUNT_XPATH).text.empty?
|
32
|
+
fail 'Content div tag may have been changed or removed' if @document.xpath(CARD_CONTENT_XPATH).text.empty?
|
33
|
+
|
34
|
+
@document.xpath(TOUR_XPATH_CARD).map do |card|
|
35
|
+
element = {}
|
36
|
+
element['img'] = card.xpath(CARD_IMGLINK_XPATH).text
|
37
|
+
element['title'] = card.xpath(CARD_TITLE_XPATH).text.strip
|
38
|
+
element['content'] = card.xpath(CARD_CONTENT_XPATH).text.strip
|
39
|
+
element['location'] = card.xpath(CARD_LOCATION_XPATH).text
|
40
|
+
element['price_currency'] = card.xpath(CARD_PRICE_CURRENCY_XPATH).text
|
41
|
+
element['price'] = card.xpath(CARD_PRICE_AMOUNT_XPATH).text
|
42
|
+
result << element
|
43
|
+
end
|
44
|
+
result.to_json
|
45
|
+
rescue StandardError => e
|
46
|
+
puts e.message
|
47
|
+
puts e.backtrace.inspect
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
test_run = LonelyPlanetToursTest.new
|
52
|
+
puts test_run.test_tour
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'oga'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
# Module defines LonelyPlanetTours class which handles scraping of lonelyplanet Taiwan tours page
|
6
|
+
module LonelyPlanetScrape
|
7
|
+
class LonelyPlanetTours
|
8
|
+
# Define constants needed for scraping
|
9
|
+
# If the test for the URI fails, please replace the constant manually
|
10
|
+
LONELYPLANET_URL = 'http://www.lonelyplanet.com'
|
11
|
+
TOUR_RELATIVE_DIR = 'taiwan/tours'
|
12
|
+
|
13
|
+
TOUR_XPATH_CARD = "//article[contains(@class,'card')]"
|
14
|
+
CARD_IMGLINK_XPATH = ".//img[contains(@class,'card__figure__img')]/@src"
|
15
|
+
CARD_TITLE_XPATH = './/h1'
|
16
|
+
CARD_CONTENT_XPATH = ".//div[contains(@class,'card__content__desc')]//p"
|
17
|
+
CARD_LINK_XPATH = ".//div[contains(@class,'card__mask')]//a"
|
18
|
+
CARD_LOCATION_XPATH = ".//div[contains(@class,'card__footer__locale')]"
|
19
|
+
CARD_PRICE_CURRENCY_XPATH = ".//span[contains(@class,'js-currency')]"
|
20
|
+
CARD_PRICE_AMOUNT_XPATH = ".//span[contains(@class,'js-price')]"
|
21
|
+
|
22
|
+
def initialize
|
23
|
+
parse_html
|
24
|
+
end
|
25
|
+
|
26
|
+
def tours
|
27
|
+
@tours ||= extract_tours
|
28
|
+
end
|
29
|
+
|
30
|
+
private
|
31
|
+
|
32
|
+
def parse_html
|
33
|
+
url = "#{LONELYPLANET_URL}/#{TOUR_RELATIVE_DIR}"
|
34
|
+
@document = Oga.parse_html(open(url))
|
35
|
+
end
|
36
|
+
|
37
|
+
def extract_tours
|
38
|
+
result = []
|
39
|
+
@document.xpath(TOUR_XPATH_CARD).map do |card|
|
40
|
+
element = {}
|
41
|
+
element['img'] = card.xpath(CARD_IMGLINK_XPATH).text
|
42
|
+
element['title'] = card.xpath(CARD_TITLE_XPATH).text.strip
|
43
|
+
element['content'] = card.xpath(CARD_CONTENT_XPATH).text.strip
|
44
|
+
element['location'] = card.xpath(CARD_LOCATION_XPATH).text
|
45
|
+
element['price_currency'] = card.xpath(CARD_PRICE_CURRENCY_XPATH).text
|
46
|
+
element['price'] = card.xpath(CARD_PRICE_AMOUNT_XPATH).text
|
47
|
+
result << element
|
48
|
+
end
|
49
|
+
result.to_json
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
data/lib/taiwan_tours.rb
ADDED