manga-crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in manga-crawler.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Thiago Rocha
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # MangaCrawler
2
+
3
+ A gem that collects manga information (titles, chapter lists and page image links) from websites.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'manga-crawler'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install manga-crawler
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Defines the `test` task: runs every file matching test/**/*_test.rb with
# both lib/ and test/ added to the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Running a bare `rake` executes the test suite.
task default: :test
@@ -0,0 +1,6 @@
1
+ require "manga-crawler/version"
2
+ require "manga-crawler/crawler"
3
+
4
# Top-level namespace of the manga-crawler gem. This file only declares the
# namespace; the version constant and the Crawler class come from the files
# required at the top of this file.
module MangaCrawler
end
@@ -0,0 +1,106 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
+
4
module MangaCrawler
  # Extracts manga listings, chapter listings and page image links from HTML
  # pages, using CSS selectors and attribute names supplied by the caller.
  class Crawler
    # Returns an array of pairs. The first position contains the
    # manga name and the second position the manga link.
    # Params:
    # +index_link+:: string with the url containing the index of all mangas
    # +css_path+:: string in CSS path format selecting the links to collect
    # +css_pagination+:: string with the css path to the next page link
    #                    (accepted but not used yet, see the TODO below)
    # +html_field+:: symbol of the attribute that holds the link
    def get_mangas(index_link, css_path, css_pagination, html_field)
      # TODO: when css_pagination is given, follow the next-page link and
      # append its results, for example:
      #   result += get_mangas next_link, css_path, css_pagination, html_field
      collect_links(index_link, css_path, html_field)
    end

    # Returns the chapters information of a manga as [name, link] pairs.
    # It uses the same logic as get_mangas.
    # Params:
    # +manga_link+:: string with the url of the manga page
    # +css_path+:: string in CSS path format selecting the links to collect
    # +css_pagination+:: string with the css path to the next page link
    # +html_field+:: symbol of the attribute that holds the link
    def get_chapters(manga_link, css_path, css_pagination, html_field)
      get_mangas(manga_link, css_path, css_pagination, html_field)
    end

    # Returns the direct links of all page images of a specific chapter.
    # It combines get_pages_links_from_chapter and get_image_from_page.
    # An entry is nil when a page has no element matching +css_image_path+.
    # Params:
    # +chapter_link+:: string with the chapter link, appended to +url_base+
    # +css_pages_path+:: string with the CSS path to the pages links
    # +pages_html_field+:: HTML field with the page link value
    # +css_image_path+:: CSS path to the image
    # +image_html_field+:: HTML field with the image's direct url
    # +url_base+:: site's base url, prepended to every chapter and page link
    def get_pages(chapter_link, css_pages_path, pages_html_field, css_image_path, image_html_field, url_base)
      pages_links = get_pages_links_from_chapter(url_base + chapter_link, css_pages_path, pages_html_field)

      pages_links.map do |_label, page_link|
        get_image_from_page(url_base + page_link, css_image_path, image_html_field)
      end
    end

    # Returns all pages of a chapter as [label, link] pairs.
    # Params:
    # +chapter_link+:: link of the chapter
    # +css_path+:: CSS path to the elements holding the pages links
    # +html_field+:: HTML field that contains the url
    def get_pages_links_from_chapter(chapter_link, css_path, html_field)
      collect_links(chapter_link, css_path, html_field)
    end

    # Returns the image's direct url of a manga page, or nil when no element
    # matches +css_path+.
    # Params:
    # +page_link+:: link of the HTML page that contains the image
    # +css_path+:: CSS path to the image
    # +html_field+:: field that contains the url
    def get_image_from_page(page_link, css_path, html_field)
      image = load_html(page_link).at_css(css_path)
      image && image[html_field]
    end

    private

    # Parses the document found at +source+ and returns a
    # [text content, value of html_field] pair for every node matching
    # +css_path+.
    def collect_links(source, css_path, html_field)
      load_html(source).css(css_path).map do |node|
        [node.content, node[html_field]]
      end
    end

    # Opens +source+ (url, file path or file object) and parses it as HTML.
    # The block form closes the handle once parsing is done, so no
    # descriptor is left open.
    def load_html(source)
      if URI.respond_to?(:open)
        # Preferred entry point: plain Kernel#open no longer fetches URLs on
        # Ruby 3.0 and later.
        URI.open(source) { |io| Nokogiri::HTML(io) }
      else
        # Older Rubies only offer the open-uri patched Kernel#open.
        open(source) { |io| Nokogiri::HTML(io) }
      end
    end
  end
end
106
+
@@ -0,0 +1,3 @@
1
module MangaCrawler
  # Release number of the manga-crawler gem; read by the gemspec.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'manga-crawler/version'
5
+
6
# Gem specification for manga-crawler. The packaged files are taken from
# `git ls-files`, so the gem must be built from inside the git checkout.
Gem::Specification.new do |spec|
  spec.name          = "manga-crawler"
  spec.version       = MangaCrawler::VERSION
  spec.authors       = ["Thiago Rocha"]
  spec.email         = ["kimobr@gmail.com"]
  spec.description   = %q{ A gem that collects mangas from websites}
  spec.summary       = %q{ Retrieve basic manga information }
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # lib/manga-crawler/crawler.rb requires nokogiri when the gem is loaded,
  # so it has to be a runtime dependency, not a development-only one.
  spec.add_dependency "nokogiri"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,57 @@
1
+ require "test_helper"
2
+
3
# Specs for MangaCrawler::Crawler, run against the static HTML fixtures
# stored in test/samples.
describe MangaCrawler::Crawler do
  crawler = MangaCrawler::Crawler.new

  # Fixtures are passed to the crawler as absolute path strings: nothing is
  # opened here that would need closing, and the specs do not depend on the
  # directory the suite is started from.
  samples_dir = File.expand_path('../../../samples', __FILE__)

  it "must retrieve mangas" do
    sample_index = File.join(samples_dir, "index-page.html")

    mangas = crawler.get_mangas sample_index, "a", nil, :href

    mangas.must_equal [ ["Naruto", "/first-manga"],
                        ["Bleach", "/second-manga"],
                        ["One Piece", "/third-manga"] ]
  end

  it "must retrieve chapters" do
    sample_manga_page = File.join(samples_dir, "manga-page.html")

    chapters = crawler.get_chapters sample_manga_page, "a", nil, :href

    chapters.must_equal [ ["Chapter 1", "/first-manga/1"],
                          ["Chapter 2", "/second-manga/2"],
                          ["Chapter 3", "/third-manga/3"] ]
  end

  it "must retrieve a direct image link from a page" do
    sample_image_page = File.join(samples_dir, "image-page.html")

    image = crawler.get_image_from_page sample_image_page, "#img", :src

    image.must_equal "image.jpg"
  end

  it "must retrieve all pages links from a chapter" do
    sample_image_page = File.join(samples_dir, "image-page.html")

    pages_links = crawler.get_pages_links_from_chapter sample_image_page, "#pageMenu option", :value

    pages_links.must_equal [ ["1", "/first-manga/1/1"],
                             ["2", "/first-manga/1/2"],
                             ["3", "/first-manga/1/3"] ]
  end
end
@@ -0,0 +1,8 @@
1
+ require "test_helper"
2
+
3
# Sanity check: the gem must expose a version constant.
describe MangaCrawler do
  it "must have a version" do
    MangaCrawler::VERSION.wont_be_nil
  end
end
@@ -0,0 +1,21 @@
1
+ <html>
2
+ <head>
3
+ <title>A sample manga website</title>
4
+ </head>
5
+ <body>
6
+
7
+ <div id="selectpage">
8
+ <select id="pageMenu" name="pageMenu">
9
+ <option value="/first-manga/1/1" selected="selected">1</option>
10
+ <option value="/first-manga/1/2">2</option>
11
+ <option value="/first-manga/1/3">3</option>
12
+ </select>
13
+ </div>
14
+
15
+ <div id="imgholder">
16
+ <a href="/first-manga/1/2">
17
+ <img id="img" src="image.jpg" alt="An image" name="img"/>
18
+ </a>
19
+ </div>
20
+ </body>
21
+ </html>
Binary file
@@ -0,0 +1,14 @@
1
+ <html>
2
+ <head>
3
+ <title>A sample manga website</title>
4
+ </head>
5
+ <body>
6
+ <div class="simple_div">
7
+ <ul class="simple_div">
8
+ <li><a href="/first-manga">Naruto</a></li>
9
+ <li><a href="/second-manga">Bleach</a></li>
10
+ <li><a href="/third-manga">One Piece</a></li>
11
+ </ul>
12
+ </div>
13
+ </body>
14
+ </html>
@@ -0,0 +1,38 @@
1
+ <html>
2
+ <head>
3
+ <title>A sample manga website</title>
4
+ </head>
5
+ <body>
6
+ <div id="chapterlist">
7
+ <table id="listing">
8
+
9
+ <tr class="table_head">
10
+ <th class="leftgap">Chapter Name</th>
11
+ <th>Date Added</th>
12
+ </tr>
13
+
14
+ <tr>
15
+ <td>
16
+ <div class="chico_manga"></div>
17
+ <a href="/first-manga/1">Chapter 1</a> : First chapter </td>
18
+ <td>09/09/2011</td>
19
+ </tr>
20
+
21
+ <tr>
22
+ <td>
23
+ <div class="chico_manga"></div>
24
+ <a href="/second-manga/2">Chapter 2</a> : Second chapter</td>
25
+ <td>09/09/2011</td>
26
+ </tr>
27
+
28
+ <tr>
29
+ <td>
30
+ <div class="chico_manga"></div>
31
+ <a href="/third-manga/3">Chapter 3</a> : Third chapter</td>
32
+ <td>09/09/2011</td>
33
+ </tr>
34
+
35
+ </table>
36
+ </div>
37
+ </body>
38
+ </html>
@@ -0,0 +1,3 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/pride'
3
+ require File.expand_path('../../lib/manga-crawler.rb', __FILE__)
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: manga-crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Thiago Rocha
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-04-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: nokogiri
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: ! ' A gem that collects mangas from websites'
63
+ email:
64
+ - kimobr@gmail.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - LICENSE.txt
72
+ - README.md
73
+ - Rakefile
74
+ - lib/manga-crawler.rb
75
+ - lib/manga-crawler/crawler.rb
76
+ - lib/manga-crawler/version.rb
77
+ - manga-crawler.gemspec
78
+ - test/lib/manga-crawler/crawler_test.rb
79
+ - test/lib/manga-crawler/version_test.rb
80
+ - test/samples/image-page.html
81
+ - test/samples/image.jpg
82
+ - test/samples/index-page.html
83
+ - test/samples/manga-page.html
84
+ - test/test_helper.rb
85
+ homepage: ''
86
+ licenses:
87
+ - MIT
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ none: false
100
+ requirements:
101
+ - - ! '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ requirements: []
105
+ rubyforge_project:
106
+ rubygems_version: 1.8.24
107
+ signing_key:
108
+ specification_version: 3
109
+ summary: Retrieve basic manga information
110
+ test_files:
111
+ - test/lib/manga-crawler/crawler_test.rb
112
+ - test/lib/manga-crawler/version_test.rb
113
+ - test/samples/image-page.html
114
+ - test/samples/image.jpg
115
+ - test/samples/index-page.html
116
+ - test/samples/manga-page.html
117
+ - test/test_helper.rb