gildia_comics_crawler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +25 -0
- data/Rakefile +1 -0
- data/gildia_comics_crawler.gemspec +24 -0
- data/lib/gildia_comics_crawler/comic_crawler.rb +87 -0
- data/lib/gildia_comics_crawler/comics_crawler.rb +21 -0
- data/lib/gildia_comics_crawler/crawler.rb +10 -0
- data/lib/gildia_comics_crawler/downloader.rb +27 -0
- data/lib/gildia_comics_crawler/series_crawler.rb +30 -0
- data/lib/gildia_comics_crawler/version.rb +3 -0
- data/lib/gildia_comics_crawler.rb +9 -0
- metadata +99 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 72ddcf2cb680b72acf076be6e2e0281feaf82063
|
4
|
+
data.tar.gz: bddad7c5e780f9c404f4df509e4a1bf7b7beb323
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ba5098722ed46e65111ce5f770f593411739f15a291853b90755247c6020a050ed8337a203c31250dec41ee23ee02754b6f337a1d19c7b578429f1ec990b5ad3
|
7
|
+
data.tar.gz: 82dd51629e8751f51dffa2515da4cebbabcffaffd7d548582d809d1c562f474b1006e09c9a3fe020ed091f77e71c4d17b74960d5671be67f78f9ecbc4f0d2434
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Jan Jędrychowski
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# GildiaComicsCrawler
|
2
|
+
|
3
|
+
A crawler for komiks.gildia.pl. It crawls the site's comic database (http://www.komiks.gildia.pl/komiksy).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'gildia_comics_crawler'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install gildia_comics_crawler
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
require 'gildia_comics_crawler'
|
22
|
+
|
23
|
+
GildiaComicsCrawler::Crawler.new.crawl do |series|
|
24
|
+
...
|
25
|
+
end
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for gildia_comics_crawler.

# Make the gem's own lib/ directory loadable so the version file can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'gildia_comics_crawler/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name        = "gildia_comics_crawler"
  gem.version     = GildiaComicsCrawler::VERSION
  gem.authors     = ["Jan Jędrychowski"]
  gem.email       = ["jan@jedrychowski.org"]
  gem.description = "Crawler for downloading comics from komiks.gildia.pl"
  gem.summary     = gem.description
  gem.homepage    = ""
  gem.license     = "MIT"

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  tracked_files     = `git ls-files`.split($/)
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Dependencies
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_runtime_dependency "nokogiri"
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module GildiaComicsCrawler
  # Scrapes a single comic page and returns its details as a Hash.
  class ComicCrawler
    include Downloader

    # Labels of the first (credits) block mapped to the result keys under
    # which the texts following each label are collected.
    FIRST_BLOCK_SECTIONS = {
      "Scenariusz:" => :scenario,
      "Rysunek:" => :drawing,
      "Tłumaczenie:" => :translation
    }.freeze

    # Labels of the second (details) block mapped to result keys.
    # Keys carry no trailing colon: lookup uses the text before the first ':'
    # of each line, so a key containing a colon could never match.
    SECOND_BLOCK_SECTIONS = {
      "Tytuł oryginalny" => :origin_title,
      "Wydawca oryginalny" => :origin_publisher,
      "Rok wydania oryginału" => :origin_publish_year,
      "Liczba stron" => :pages_count,
      "Oprawa" => :binding,
      "Papier" => :paper,
      "Druk" => :color,
      "ISBN-13" => :isbn,
      "Wydanie" => :edition,
      "Cena z okładki" => :original_price
    }.freeze

    # @param link [String] address of the comic page, passed to Downloader#download
    def initialize(link)
      @link = link
    end

    # Downloads the page and extracts the comic's data.
    #
    # @return [Hash] always contains :gildia_link, :gildia_sklep_link, :cover
    #   (the latter two are nil when the node is absent) and :title, plus
    #   whatever the two detail blocks yield.
    def crawl
      @noko = download(@link)
      # @downloader_uri is set by Downloader#download to the resolved address.
      @data = {gildia_link: @downloader_uri}
      @data[:gildia_sklep_link] = attribute_of_first('#product a.p', :href)
      @data[:cover] = attribute_of_first('.main-article-image', :src)
      elements = @noko.css('.widetext').children
      @data[:title] = elements.css('h1').children.first.text

      @data.merge! first_block_elements(elements)
      @data.merge! second_block_elements(elements)

      @data
    end

    private

    # Returns the given attribute of the first node matching +selector+, or
    # nil when nothing matches. Replaces a blanket `rescue nil` so that only
    # the "node is missing" case is tolerated.
    def attribute_of_first(selector, attribute)
      node = @noko.css(selector).first
      node && node[attribute]
    end

    # Collects the credits that precede the first unclassed <div>.
    #
    # Text and <a> nodes are read in document order; a text equal to a
    # FIRST_BLOCK_SECTIONS label switches the current section, any other text
    # is appended to the current section's list.
    #
    # @return [Hash{Symbol => Array<String>}]
    def first_block_elements(elements)
      leading = elements.take_while { |el| !(el.name == 'div' && !el[:class]) }
      candidates = leading.select { |el| el.name == 'text' || el.name == 'a' }
      texts = candidates.map { |el| el.text.strip }.reject { |t| t.empty? || t == ',' }

      data = {}
      current_section = nil
      texts.each do |text|
        new_section = FIRST_BLOCK_SECTIONS[text]
        if new_section
          current_section = new_section
        elsif current_section
          # Texts seen before any known label are dropped rather than being
          # stored under a nil key.
          (data[current_section] ||= []) << text
        end
      end

      data
    end

    # Reads the second unclassed <div>: an optional publisher pair, an
    # optional "month/year" publish date, then "Label: value" lines.
    #
    # @return [Hash] empty when the page has no second unclassed <div>
    def second_block_elements(elements)
      data = {}
      block = elements.filter('div:not([class])')[1]
      return data unless block

      extracted_data = block.children.map { |node| node.text.strip }.reject(&:empty?)

      if extracted_data[0] == 'Wydawnictwo:'
        data[:publisher] = extracted_data[1]
        extracted_data.shift(2)
      end

      # \A/\z anchor the whole string; ^/$ would also accept a date on one
      # line of a multi-line text.
      if extracted_data[0] =~ /\A\d+\/\d+\z/
        month, year = extracted_data.shift.split('/')
        data[:publish_date_month] = month
        data[:publish_date_year] = year
      end

      extracted_data.each do |line|
        attr, val = line.split(':', 2)
        attr_sym = SECOND_BLOCK_SECTIONS[attr]
        # val is nil when the line has no colon at all; skip such lines
        # instead of calling strip on nil.
        data[attr_sym] = val.strip if attr_sym && val
      end

      data
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module GildiaComicsCrawler
  # Crawls every comic linked from one listing page.
  class ComicsCrawler
    include Downloader

    # @param base_url [String] address of the listing page, passed to Downloader#download
    def initialize(base_url)
      @base_url = base_url
    end

    # Crawls each linked comic in turn.
    #
    # @return [Array] one ComicCrawler#crawl result per link from #comics_list
    def all_comics
      comics_list.map { |comic_link| ComicCrawler.new(comic_link).crawl }
    end

    # @return [Array] href of every anchor matched by '.CDgallery a'
    def comics_list
      gallery_anchors = download(@base_url).css('.CDgallery a')
      gallery_anchors.map { |anchor| anchor[:href] }
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module GildiaComicsCrawler
  # Mixin that downloads a page and parses it with Nokogiri.
  module Downloader
    # Root against which addresses not starting with 'http://' are resolved.
    BASE_URI = 'http://www.komiks.gildia.pl/'.freeze

    # Resolves one address and fetches it.
    class Base
      # The resolved address: the original String when it already started
      # with 'http://', otherwise the URI object produced by URI.join.
      attr_accessor :uri

      # @param uri [String] address starting with 'http://', or anything
      #   URI.join can resolve against BASE_URI
      def initialize(uri)
        uri = URI.join(BASE_URI, uri) unless uri.start_with?('http://')
        @uri = uri
      end

      # Fetches the page and parses it.
      #
      # Uses the URI object's own #open (provided by open-uri) rather than
      # Kernel#open: since Ruby 3.0 Kernel#open no longer opens URLs. The
      # block form also closes the downloaded IO once parsing is done.
      #
      # @return [Nokogiri::HTML::Document]
      def download
        URI.parse(@uri.to_s).open { |io| Nokogiri::HTML(io) }
      end
    end

    # Downloads and parses +uri+, remembering the resolved address in
    # @downloader_uri on the including object.
    #
    # @param uri [String] see Base#initialize
    # @return [Nokogiri::HTML::Document]
    def download(uri)
      downloader = Base.new(uri)
      @downloader_uri = downloader.uri.to_s
      downloader.download
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module GildiaComicsCrawler
  # Builds the list of series from the site's per-letter index pages.
  class SeriesCrawler
    include Downloader

    # Address of the series index, passed to Downloader#download.
    SERIES_BASE_URL = 'komiksy'

    # @return [Array<Hash>] the series of every page from #find_pages, concatenated
    def all_series
      find_pages.flat_map { |page| series_from_page(page) }
    end

    # @param page [String] address of one index page
    # @return [Array<Hash>] a { name:, link: } hash per anchor matched by '.long-list a'
    def series_from_page(page)
      anchors = download(page).css('.long-list a')
      anchors.map { |anchor| {name: anchor.text, link: anchor[:href]} }
    end

    # Hrefs of the anchors matched by '.header-letters a' on the series index.
    # The result is memoized in @pages, so the index is downloaded only once.
    def find_pages
      return @pages if @pages

      letter_anchors = download(SERIES_BASE_URL).css('.header-letters a')
      @pages = letter_anchors.map { |anchor| anchor[:href] }
    end
  end
end
|
@@ -0,0 +1,9 @@
|
|
1
|
+
require "gildia_comics_crawler/version"
|
2
|
+
require "gildia_comics_crawler/downloader"
|
3
|
+
require "gildia_comics_crawler/series_crawler"
|
4
|
+
require "gildia_comics_crawler/comic_crawler"
|
5
|
+
require "gildia_comics_crawler/comics_crawler"
|
6
|
+
require "gildia_comics_crawler/crawler"
|
7
|
+
|
8
|
+
# Top-level namespace of the gildia_comics_crawler gem. It is intentionally
# empty here; its contents come from the files required above.
module GildiaComicsCrawler; end
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gildia_comics_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jan Jędrychowski
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-21 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: Crawler for downloading comics from komiks.gildia.pl
|
56
|
+
email:
|
57
|
+
- jan@jedrychowski.org
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- .gitignore
|
63
|
+
- Gemfile
|
64
|
+
- LICENSE.txt
|
65
|
+
- README.md
|
66
|
+
- Rakefile
|
67
|
+
- gildia_comics_crawler.gemspec
|
68
|
+
- lib/gildia_comics_crawler.rb
|
69
|
+
- lib/gildia_comics_crawler/comic_crawler.rb
|
70
|
+
- lib/gildia_comics_crawler/comics_crawler.rb
|
71
|
+
- lib/gildia_comics_crawler/crawler.rb
|
72
|
+
- lib/gildia_comics_crawler/downloader.rb
|
73
|
+
- lib/gildia_comics_crawler/series_crawler.rb
|
74
|
+
- lib/gildia_comics_crawler/version.rb
|
75
|
+
homepage: ''
|
76
|
+
licenses:
|
77
|
+
- MIT
|
78
|
+
metadata: {}
|
79
|
+
post_install_message:
|
80
|
+
rdoc_options: []
|
81
|
+
require_paths:
|
82
|
+
- lib
|
83
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '0'
|
88
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
89
|
+
requirements:
|
90
|
+
- - '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 2.0.14
|
96
|
+
signing_key:
|
97
|
+
specification_version: 4
|
98
|
+
summary: Crawler for downloading comics from komiks.gildia.pl
|
99
|
+
test_files: []
|