gildia_comics_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 72ddcf2cb680b72acf076be6e2e0281feaf82063
4
+ data.tar.gz: bddad7c5e780f9c404f4df509e4a1bf7b7beb323
5
+ SHA512:
6
+ metadata.gz: ba5098722ed46e65111ce5f770f593411739f15a291853b90755247c6020a050ed8337a203c31250dec41ee23ee02754b6f337a1d19c7b578429f1ec990b5ad3
7
+ data.tar.gz: 82dd51629e8751f51dffa2515da4cebbabcffaffd7d548582d809d1c562f474b1006e09c9a3fe020ed091f77e71c4d17b74960d5671be67f78f9ecbc4f0d2434
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in gildia_comics_crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jan Jędrychowski
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # GildiaComicsCrawler
2
+
3
+ A crawler for komiks.gildia.pl. It crawls the site's comic database (http://www.komiks.gildia.pl/komiksy) and yields each series together with the data scraped for its comics.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'gildia_comics_crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install gildia_comics_crawler
18
+
19
+ ## Usage
20
+
21
+ require 'gildia_comics_crawler'
22
+
23
+ GildiaComicsCrawler::Crawler.new.crawl do |series|
24
+ ...
25
+ end
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# coding: utf-8
# Gem specification for gildia_comics_crawler.
# Puts ./lib on the load path so the version constant can be read before
# the gem is installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'gildia_comics_crawler/version'

Gem::Specification.new do |spec|
  spec.name          = "gildia_comics_crawler"
  spec.version       = GildiaComicsCrawler::VERSION
  spec.authors       = ["Jan Jędrychowski"]
  spec.email         = ["jan@jedrychowski.org"]
  spec.description   = "Crawler for downloading comics from komiks.gildia.pl"
  spec.summary       = spec.description
  spec.homepage      = ""
  spec.license       = "MIT"

  # `-z` makes git emit NUL-separated, unquoted paths. Without it, file names
  # containing unusual characters come back quoted/escaped, and splitting on
  # the `$/` global depends on whatever the record separator currently is.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # ">= 1.3" instead of "~> 1.3": the pessimistic constraint rejects
  # Bundler 2.x, which makes the development setup unresolvable there.
  spec.add_development_dependency "bundler", ">= 1.3"
  spec.add_development_dependency "rake"
  spec.add_runtime_dependency "nokogiri"
end
@@ -0,0 +1,87 @@
1
module GildiaComicsCrawler
  # Scrapes a single comic page into a Hash of attributes.
  #
  # The page is fetched through the Downloader mixin; the result of #crawl
  # always contains :gildia_link and may contain the keys listed in
  # FIRST_BLOCK_SECTIONS / SECOND_BLOCK_SECTIONS plus :gildia_sklep_link,
  # :cover, :title, :publisher, :publish_date_month and :publish_date_year.
  class ComicCrawler
    include Downloader

    # Labels that open a credits section in the first block of the page,
    # mapped to the result key that collects the entries following the label.
    FIRST_BLOCK_SECTIONS = {
      "Scenariusz:" => :scenario,
      "Rysunek:" => :drawing,
      "Tłumaczenie:" => :translation
    }.freeze

    # Labels of the "Label: value" lines in the second block, mapped to result
    # keys. Keys carry NO trailing colon because the label is obtained by
    # splitting the line on its first ':'.
    SECOND_BLOCK_SECTIONS = {
      "Tytuł oryginalny" => :origin_title,
      "Wydawca oryginalny" => :origin_publisher,
      "Rok wydania oryginału" => :origin_publish_year,
      "Liczba stron" => :pages_count,
      "Oprawa" => :binding,
      "Papier" => :paper,
      "Druk" => :color,
      "ISBN-13" => :isbn,
      "Wydanie" => :edition,
      "Cena z okładki" => :original_price
    }.freeze

    # @param link [String] absolute or site-relative URL of the comic page
    def initialize(link)
      @link = link
    end

    # Downloads the comic page and extracts its attributes.
    #
    # @return [Hash] scraped attributes; optional fields are nil or absent
    #   when the corresponding markup is missing
    def crawl
      @noko = download(@link)
      @data = { gildia_link: @downloader_uri }
      @data[:gildia_sklep_link] = first_attribute('#product a.p', :href)
      @data[:cover] = first_attribute('.main-article-image', :src)

      elements = @noko.css('.widetext').children
      # A page without a heading yields a nil title instead of aborting the crawl.
      title_node = elements.css('h1').children.first
      @data[:title] = title_node && title_node.text

      @data.merge! first_block_elements(elements)
      @data.merge! second_block_elements(elements)

      @data
    end

    private

    # Returns the given attribute of the first node matching +selector+,
    # or nil when nothing matches.
    def first_attribute(selector, attribute)
      node = @noko.css(selector).first
      node && node[attribute]
    end

    # Collects the credits that precede the first class-less <div>.
    #
    # Text and <a> nodes are read in document order; a node whose text equals
    # a FIRST_BLOCK_SECTIONS label switches the current section, any other
    # non-empty text (except a bare ",") is appended to the current section.
    # Text seen before the first label is ignored rather than stored under a
    # nil key.
    #
    # @return [Hash{Symbol => Array<String>}]
    def first_block_elements(elements)
      texts = []
      elements.each do |el|
        break if el.name == 'div' && !el[:class]
        texts << el.text.strip if el.name == 'text' || el.name == 'a'
      end

      data = {}
      current_section = nil
      texts.each do |text|
        next if text.empty? || text == ','

        if (new_section = FIRST_BLOCK_SECTIONS[text])
          current_section = new_section
        elsif current_section
          (data[current_section] ||= []) << text
        end
      end

      data
    end

    # Parses the second class-less <div> of the page.
    #
    # Recognises, in this order: an optional "Wydawnictwo:" label followed by
    # the publisher, an optional "MM/YYYY" publish date, and then any number
    # of "Label: value" lines whose label is listed in SECOND_BLOCK_SECTIONS.
    #
    # @return [Hash] empty when the block is missing
    def second_block_elements(elements)
      data = {}
      block = elements.filter('div:not([class])')[1]
      return data unless block

      lines = block.children.map { |node| node.text.strip }.reject(&:empty?)

      if lines[0] == 'Wydawnictwo:'
        data[:publisher] = lines[1]
        lines.shift(2)
      end

      if lines[0] =~ /^\d+\/\d+$/
        month, year = lines.shift.split('/')
        data[:publish_date_month] = month
        data[:publish_date_year] = year
      end

      lines.each do |line|
        label, value = line.split(':', 2)
        next unless value # no ':' on this line, so there is no value to store

        key = SECOND_BLOCK_SECTIONS[label.strip]
        data[key] = value.strip if key
      end

      data
    end
  end
end
@@ -0,0 +1,21 @@
1
module GildiaComicsCrawler
  # Crawls every comic linked from one series page.
  class ComicsCrawler
    include Downloader

    # @param base_url [String] URL of the series page listing its comics
    def initialize(base_url)
      @base_url = base_url
    end

    # Crawls each comic found by #comics_list.
    #
    # @return [Array] one ComicCrawler#crawl result per link
    def all_comics
      comics_list.map { |href| ComicCrawler.new(href).crawl }
    end

    # Downloads the series page and returns the href of every anchor
    # matching '.CDgallery a'.
    def comics_list
      anchors = download(@base_url).css('.CDgallery a')
      anchors.map { |anchor| anchor[:href] }
    end
  end
end
@@ -0,0 +1,10 @@
1
module GildiaComicsCrawler
  # Entry point of the gem: walks every series and the comics of each one.
  class Crawler
    # Yields one Hash per series: the series data returned by
    # SeriesCrawler#all_series merged with a :comics key holding the result
    # of ComicsCrawler#all_comics for that series' :link.
    #
    # @yieldparam series [Hash] series data including :comics
    # @return [Enumerator] when called without a block (instead of raising
    #   LocalJumpError on the first yield); otherwise the list of series
    #   that was iterated
    def crawl
      return enum_for(:crawl) unless block_given?

      SeriesCrawler.new.all_series.each do |series|
        comics = ComicsCrawler.new(series[:link]).all_comics
        yield series.merge(comics: comics)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
module GildiaComicsCrawler
  # Mixin that fetches a page and parses it with Nokogiri.
  #
  # Including classes call #download; as a side effect the resolved URL of
  # the last request is stored in @downloader_uri on the including object.
  module Downloader
    # Root against which site-relative links are resolved.
    BASE_URI = 'http://www.komiks.gildia.pl/'

    # Resolves one link and downloads it.
    class Base
      # @param uri [String] a URL starting with 'http://' is kept as given;
      #   anything else is joined onto BASE_URI
      def initialize(uri)
        uri = URI::join(BASE_URI, uri) unless uri.start_with?('http://')
        @uri = uri
      end

      attr_accessor :uri

      # Fetches the page and returns the parsed document.
      #
      # Goes through the URI object's own #open (provided by open-uri)
      # rather than Kernel#open, which no longer accepts URLs on Ruby 3.0+.
      # The block form closes the underlying IO once parsing is done.
      def download
        URI(@uri.to_s).open { |io| Nokogiri::HTML(io) }
      end
    end

    # Downloads +uri+ and remembers its resolved form in @downloader_uri.
    #
    # @param uri [String] absolute or site-relative URL
    # @return the document returned by Nokogiri::HTML
    def download(uri)
      downloader = Base.new(uri)
      @downloader_uri = downloader.uri.to_s
      downloader.download
    end
  end
end
@@ -0,0 +1,30 @@
1
module GildiaComicsCrawler
  # Lists every comic series reachable from the index pages.
  class SeriesCrawler
    include Downloader

    # Site-relative path of the series index.
    SERIES_BASE_URL = 'komiksy'

    # @return [Array<Hash>] the series of all index pages, concatenated
    def all_series
      find_pages.flat_map { |page| series_from_page(page) }
    end

    # Downloads one index page and returns a { name:, link: } Hash for every
    # anchor matching '.long-list a'.
    def series_from_page(page)
      anchors = download(page).css('.long-list a')
      anchors.map do |anchor|
        { name: anchor.text, link: anchor[:href] }
      end
    end

    # Hrefs of the anchors matching '.header-letters a' on the series index.
    # The list is memoized in @pages after the first download.
    def find_pages
      @pages ||= download(SERIES_BASE_URL)
                 .css('.header-letters a')
                 .map { |anchor| anchor[:href] }
    end
  end
end
@@ -0,0 +1,3 @@
1
module GildiaComicsCrawler
  # Release number of the gem. Frozen so the shared constant cannot be
  # mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,9 @@
1
+ require "gildia_comics_crawler/version"
2
+ require "gildia_comics_crawler/downloader"
3
+ require "gildia_comics_crawler/series_crawler"
4
+ require "gildia_comics_crawler/comic_crawler"
5
+ require "gildia_comics_crawler/comics_crawler"
6
+ require "gildia_comics_crawler/crawler"
7
+
8
# Top-level namespace of the gem. It is intentionally empty here: the
# classes and constants living in it are defined by the files required above.
module GildiaComicsCrawler
end
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gildia_comics_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jan Jędrychowski
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Crawler for downloading comics from komiks.gildia.pl
56
+ email:
57
+ - jan@jedrychowski.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - Gemfile
64
+ - LICENSE.txt
65
+ - README.md
66
+ - Rakefile
67
+ - gildia_comics_crawler.gemspec
68
+ - lib/gildia_comics_crawler.rb
69
+ - lib/gildia_comics_crawler/comic_crawler.rb
70
+ - lib/gildia_comics_crawler/comics_crawler.rb
71
+ - lib/gildia_comics_crawler/crawler.rb
72
+ - lib/gildia_comics_crawler/downloader.rb
73
+ - lib/gildia_comics_crawler/series_crawler.rb
74
+ - lib/gildia_comics_crawler/version.rb
75
+ homepage: ''
76
+ licenses:
77
+ - MIT
78
+ metadata: {}
79
+ post_install_message:
80
+ rdoc_options: []
81
+ require_paths:
82
+ - lib
83
+ required_ruby_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - '>='
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ required_rubygems_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ requirements: []
94
+ rubyforge_project:
95
+ rubygems_version: 2.0.14
96
+ signing_key:
97
+ specification_version: 4
98
+ summary: Crawler for downloading comics from komiks.gildia.pl
99
+ test_files: []