gildia_comics_crawler 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 72ddcf2cb680b72acf076be6e2e0281feaf82063
4
+ data.tar.gz: bddad7c5e780f9c404f4df509e4a1bf7b7beb323
5
+ SHA512:
6
+ metadata.gz: ba5098722ed46e65111ce5f770f593411739f15a291853b90755247c6020a050ed8337a203c31250dec41ee23ee02754b6f337a1d19c7b578429f1ec990b5ad3
7
+ data.tar.gz: 82dd51629e8751f51dffa2515da4cebbabcffaffd7d548582d809d1c562f474b1006e09c9a3fe020ed091f77e71c4d17b74960d5671be67f78f9ecbc4f0d2434
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gildia_comics_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jan Jędrychowski
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # GildiaComicsCrawler
2
+
3
+ A crawler for komiks.gildia.pl. It crawls the site's comic database (http://www.komiks.gildia.pl/komiksy).
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'gildia_comics_crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install gildia_comics_crawler
18
+
19
+ ## Usage
20
+
21
+ require 'gildia_comics_crawler'
22
+
23
+ GildiaComicsCrawler::Crawler.new.crawl do |series|
24
+ ...
25
+ end
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
# Make the gem's own lib/ directory loadable so the version constant can be
# required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'gildia_comics_crawler/version'

# Gem specification: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  # --- identity ---
  gem.name        = "gildia_comics_crawler"
  gem.version     = GildiaComicsCrawler::VERSION
  gem.authors     = ["Jan Jędrychowski"]
  gem.email       = ["jan@jedrychowski.org"]
  gem.description = "Crawler for downloading comics from komiks.gildia.pl"
  gem.summary     = gem.description
  gem.homepage    = ""
  gem.license     = "MIT"

  # --- packaged files: everything tracked by git ---
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # --- dependencies ---
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_runtime_dependency "nokogiri"
end
@@ -0,0 +1,87 @@
1
module GildiaComicsCrawler
  # Scrapes a single comic page and returns the extracted metadata as a Hash.
  class ComicCrawler
    include Downloader

    # Labels that open a credit section in the first block, mapped to the
    # result key under which the following names are collected.
    FIRST_BLOCK_SECTIONS = {
      "Scenariusz:" => :scenario,
      "Rysunek:" => :drawing,
      "Tłumaczenie:" => :translation
    }.freeze

    # "Label: value" lines of the second block, mapped to result keys.
    # Labels are compared against the text before the first colon, so they
    # must not contain a trailing colon themselves.
    SECOND_BLOCK_SECTIONS = {
      "Tytuł oryginalny" => :origin_title,
      "Wydawca oryginalny" => :origin_publisher,
      "Rok wydania oryginału" => :origin_publish_year,
      "Liczba stron" => :pages_count,
      "Oprawa" => :binding,
      "Papier" => :paper,
      "Druk" => :color,
      "ISBN-13" => :isbn,
      "Wydanie" => :edition,
      "Cena z okładki" => :original_price
    }.freeze

    # @param link [String] link of the comic page, passed to Downloader#download
    def initialize(link)
      @link = link
    end

    # Downloads the comic page and extracts its metadata.
    #
    # @return [Hash] always contains :gildia_link, :gildia_sklep_link, :cover
    #   and :title; the remaining keys depend on what the page provides.
    #   :gildia_sklep_link and :cover are nil when the page lacks them.
    def crawl
      @noko = download(@link)
      @data = { gildia_link: @downloader_uri }
      @data[:gildia_sklep_link] = first_attribute('#product a.p', :href)
      @data[:cover] = first_attribute('.main-article-image', :src)
      elements = @noko.css('.widetext').children
      @data[:title] = elements.css('h1').children.first.text

      @data.merge!(first_block_elements(elements))
      @data.merge!(second_block_elements(elements))

      @data
    end

    private

    # Returns the given attribute of the first node matching +selector+, or
    # nil when nothing matches.
    def first_attribute(selector, attribute)
      node = @noko.css(selector).first
      node && node[attribute]
    end

    # Collects the credits listed before the first class-less <div>.
    #
    # Only text nodes and links are considered. A text equal to one of the
    # FIRST_BLOCK_SECTIONS labels switches the current section; every other
    # text is appended to the current section's list.
    #
    # @return [Hash{Symbol => Array<String>}]
    def first_block_elements(elements)
      texts = elements
        .take_while { |el| !(el.name == 'div' && !el[:class]) }
        .select { |el| el.name == 'text' || el.name == 'a' }
        .map { |el| el.text.strip }
        .reject { |text| text.empty? || text == ',' }

      data = {}
      current_section = nil
      texts.each do |text|
        if (section = FIRST_BLOCK_SECTIONS[text])
          current_section = section
        elsif current_section
          # Text seen before any section label belongs to no section and is
          # skipped instead of being filed under a nil key.
          (data[current_section] ||= []) << text
        end
      end

      data
    end

    # Extracts publication details from the second class-less <div>.
    #
    # @return [Hash] empty when the page has no such <div>
    def second_block_elements(elements)
      block = elements.filter('div:not([class])')[1]
      return {} unless block

      lines = block.children.map { |node| node.text.strip }.reject(&:empty?)
      data = {}

      # The publisher is given as a label followed by a separate value entry.
      if lines[0] == 'Wydawnictwo:'
        data[:publisher] = lines[1]
        lines.shift(2)
      end

      # Publication date in "month/year" form.
      if lines[0] =~ /^\d+\/\d+$/
        month, year = lines.shift.split('/')
        data[:publish_date_month] = month
        data[:publish_date_year] = year
      end

      lines.each do |line|
        label, value = line.split(':', 2)
        key = SECOND_BLOCK_SECTIONS[label]
        # Lines without a colon carry no value and are skipped.
        data[key] = value.strip if key && value
      end

      data
    end
  end
end
@@ -0,0 +1,21 @@
1
module GildiaComicsCrawler
  # Crawls every comic linked from one listing page.
  class ComicsCrawler
    include Downloader

    # @param base_url [String] link of the page listing the comics
    def initialize(base_url)
      @base_url = base_url
    end

    # Crawls each comic found on the listing page.
    #
    # @return [Array<Hash>] one ComicCrawler#crawl result per comic link
    def all_comics
      comics_list.map { |link| ComicCrawler.new(link).crawl }
    end

    # Links of all anchors inside `.CDgallery` on the listing page.
    #
    # Anchors without an href are dropped: a nil link cannot be downloaded
    # and would make the downloader raise.
    #
    # @return [Array<String>]
    def comics_list
      download(@base_url).css('.CDgallery a').map { |anchor| anchor[:href] }.compact
    end
  end
end
@@ -0,0 +1,10 @@
1
module GildiaComicsCrawler
  # Entry point: walks every series and yields it together with its comics.
  class Crawler
    # Crawls all series returned by SeriesCrawler and, for each of them, all
    # comics returned by ComicsCrawler.
    #
    # @yieldparam series [Hash] the series hash merged with a :comics key
    #   holding the crawled comics
    # @return [Enumerator] when called without a block, so that the crawl can
    #   be consumed lazily instead of failing on the first yield
    def crawl
      return enum_for(:crawl) unless block_given?

      SeriesCrawler.new.all_series.each do |series|
        comics = ComicsCrawler.new(series[:link]).all_comics
        yield series.merge(comics: comics)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
module GildiaComicsCrawler
  # Mixin that downloads a page and parses it with Nokogiri.
  module Downloader
    # Site root against which relative links are resolved.
    BASE_URI = 'http://www.komiks.gildia.pl/'

    # Resolves a single link and downloads the document behind it.
    class Base
      attr_accessor :uri

      # @param uri [String] a link; anything not starting with "http://" is
      #   resolved against BASE_URI, everything else is kept as given
      def initialize(uri)
        uri = URI.join(BASE_URI, uri) unless uri.start_with?('http://')
        @uri = uri
      end

      # Fetches the resolved URI and parses the response as HTML.
      #
      # The URI is parsed and opened through its own #open method (provided
      # by open-uri) rather than through Kernel#open: Kernel#open stopped
      # accepting URLs in Ruby 3.0 and, given a string, may run a command or
      # open a local file.
      #
      # @return [Nokogiri::HTML::Document]
      def download
        Nokogiri::HTML(URI.parse(@uri.to_s).open)
      end
    end

    # Downloads +uri+ and remembers the resolved address in @downloader_uri
    # on the including object.
    #
    # @param uri [String] absolute or site-relative link
    # @return [Nokogiri::HTML::Document]
    def download(uri)
      downloader = Base.new(uri)
      @downloader_uri = downloader.uri.to_s
      downloader.download
    end
  end
end
@@ -0,0 +1,30 @@
1
module GildiaComicsCrawler
  # Lists every comic series found on the site's index pages.
  class SeriesCrawler
    include Downloader

    # Link of the series index, passed to Downloader#download.
    SERIES_BASE_URL = 'komiksy'

    # Collects the series from every index page.
    #
    # @return [Array<Hash>] hashes with :name and :link keys
    def all_series
      find_pages.flat_map { |page| series_from_page(page) }
    end

    # Series linked from `.long-list` on a single index page.
    #
    # @param page [String] link of the index page
    # @return [Array<Hash>] hashes with :name and :link keys
    def series_from_page(page)
      linked_anchors(download(page), '.long-list a').map do |anchor|
        { name: anchor.text, link: anchor[:href] }
      end
    end

    # Links of the index pages found in `.header-letters`.
    # The result is memoised, so the index is downloaded at most once per
    # instance.
    #
    # @return [Array<String>]
    def find_pages
      @pages ||= linked_anchors(download(SERIES_BASE_URL), '.header-letters a').map do |anchor|
        anchor[:href]
      end
    end

    private

    # Anchors matching +selector+ that actually carry an href. Anchors
    # without one are skipped because a nil link cannot be downloaded later.
    def linked_anchors(document, selector)
      document.css(selector).select { |anchor| anchor[:href] }
    end
  end
end
@@ -0,0 +1,3 @@
1
module GildiaComicsCrawler
  # Gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,9 @@
1
+ require "gildia_comics_crawler/version"
2
+ require "gildia_comics_crawler/downloader"
3
+ require "gildia_comics_crawler/series_crawler"
4
+ require "gildia_comics_crawler/comic_crawler"
5
+ require "gildia_comics_crawler/comics_crawler"
6
+ require "gildia_comics_crawler/crawler"
7
+
8
+ module GildiaComicsCrawler
9
+ end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gildia_comics_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jan Jędrychowski
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Crawler for downloading comics from komiks.gildia.pl
56
+ email:
57
+ - jan@jedrychowski.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - Gemfile
64
+ - LICENSE.txt
65
+ - README.md
66
+ - Rakefile
67
+ - gildia_comics_crawler.gemspec
68
+ - lib/gildia_comics_crawler.rb
69
+ - lib/gildia_comics_crawler/comic_crawler.rb
70
+ - lib/gildia_comics_crawler/comics_crawler.rb
71
+ - lib/gildia_comics_crawler/crawler.rb
72
+ - lib/gildia_comics_crawler/downloader.rb
73
+ - lib/gildia_comics_crawler/series_crawler.rb
74
+ - lib/gildia_comics_crawler/version.rb
75
+ homepage: ''
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.14
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: Crawler for downloading comics from komiks.gildia.pl
99
+ test_files: []