apollo-crawler 0.1.12 → 0.1.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/lib/apollo_crawler.rb +1 -0
- data/lib/apollo_crawler/config.rb +1 -1
- data/lib/apollo_crawler/crawler/base_crawler.rb +1 -1
- data/lib/apollo_crawler/crawler/youjizz_crawler.rb +59 -0
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
- data/lib/apollo_crawler/lib.rb +1 -0
- data/lib/apollo_crawler/program.rb +32 -6
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
|
10
|
+
MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
|
11
|
+
NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
|
14
|
+
ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
|
15
|
+
YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
|
data/lib/apollo_crawler.rb
CHANGED
@@ -39,6 +39,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_cr
|
|
39
39
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
40
40
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
41
41
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
42
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
|
42
43
|
|
43
44
|
# Fetchers
|
44
45
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
|
@@ -160,7 +160,7 @@ module Apollo
|
|
160
160
|
def self.create_metadoc(url, doc)
|
161
161
|
return {
|
162
162
|
'url' => url,
|
163
|
-
'doc' => doc.encode('
|
163
|
+
'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
164
164
|
'hash' => Digest::SHA256.new.update(doc).hexdigest,
|
165
165
|
'created_at' => Time.now.utc,
|
166
166
|
'expires_at' => nil,
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Crawler
|
23
|
+
class YoujizzCrawler < BaseCrawler
|
24
|
+
@@MATCHER_ITEM = "//span[@id = 'miniatura']/span/a"
|
25
|
+
|
26
|
+
def name()
|
27
|
+
return "Youjizz"
|
28
|
+
end
|
29
|
+
|
30
|
+
def url()
|
31
|
+
return "http://www.youjizz.com/"
|
32
|
+
end
|
33
|
+
|
34
|
+
def extract_data(doc)
|
35
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
36
|
+
link = BaseCrawler.try_get_url(self.url, node['href'])
|
37
|
+
next if link.nil?
|
38
|
+
|
39
|
+
{
|
40
|
+
:text => link,
|
41
|
+
:link => link
|
42
|
+
}
|
43
|
+
}
|
44
|
+
end
|
45
|
+
|
46
|
+
def extract_links(doc)
|
47
|
+
res = doc.xpath("//div[@id = 'pagination']/a").map { | node |
|
48
|
+
link = BaseCrawler.try_get_url(self.url, node['href'])
|
49
|
+
next if link.nil?
|
50
|
+
|
51
|
+
{
|
52
|
+
:link => link
|
53
|
+
}
|
54
|
+
}
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end # Crawler
|
58
|
+
end # Apollo
|
59
|
+
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -36,6 +36,7 @@ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
|
36
36
|
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
37
37
|
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
38
38
|
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
39
|
+
require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
|
39
40
|
|
40
41
|
# Fetchers
|
41
42
|
require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
|
@@ -43,6 +43,10 @@ require File.join(File.dirname(__FILE__), 'version')
|
|
43
43
|
|
44
44
|
module Apollo
|
45
45
|
class CrawlerProgram
|
46
|
+
@@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
|
47
|
+
@@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
|
48
|
+
|
49
|
+
# Load default config
|
46
50
|
require File.join(File.dirname(__FILE__), "config")
|
47
51
|
|
48
52
|
# This hash will hold all of the options
|
@@ -130,7 +134,11 @@ module Apollo
|
|
130
134
|
|
131
135
|
opts.on(nil, '--list-formatters', 'List of formatters available') do
|
132
136
|
@options[:list_formatters] = true
|
133
|
-
end
|
137
|
+
end
|
138
|
+
|
139
|
+
opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
|
140
|
+
@options[:silent] = true
|
141
|
+
end
|
134
142
|
end
|
135
143
|
end
|
136
144
|
|
@@ -197,13 +205,13 @@ module Apollo
|
|
197
205
|
# Load global options first
|
198
206
|
# Merge it with local options (if they exist)
|
199
207
|
def load_config_file()
|
200
|
-
config =
|
208
|
+
config = @@CONFIG_PATH
|
209
|
+
|
201
210
|
if(File.exists?(config))
|
202
211
|
if(@options[:verbose])
|
203
|
-
|
212
|
+
puts "Loading config '#{config}'"
|
204
213
|
end
|
205
214
|
|
206
|
-
# puts "Let's require '#{@options[:verbose]}'"
|
207
215
|
require config
|
208
216
|
else
|
209
217
|
if(@options[:verbose])
|
@@ -429,8 +437,10 @@ module Apollo
|
|
429
437
|
docs = [docs]
|
430
438
|
end
|
431
439
|
|
432
|
-
|
433
|
-
|
440
|
+
if @options[:silent] != true
|
441
|
+
docs.each do |doc|
|
442
|
+
puts @formatter.format(doc)
|
443
|
+
end
|
434
444
|
end
|
435
445
|
}
|
436
446
|
end
|
@@ -452,12 +462,28 @@ module Apollo
|
|
452
462
|
return crawlers
|
453
463
|
end
|
454
464
|
|
465
|
+
def init_program_directory()
|
466
|
+
dir = File.expand_path("~/.apollo-crawler")
|
467
|
+
if(File.directory?(dir) == false)
|
468
|
+
FileUtils.mkpath(dir)
|
469
|
+
end
|
470
|
+
|
471
|
+
config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
|
472
|
+
dest_path = File.join(dir, 'config.rb')
|
473
|
+
|
474
|
+
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
+
FileUtils.cp(config_path, dest_path)
|
476
|
+
end
|
477
|
+
end
|
478
|
+
|
455
479
|
# Init program
|
456
480
|
def init_program(args)
|
457
481
|
init_options()
|
458
482
|
|
459
483
|
parse_options(args)
|
460
484
|
|
485
|
+
init_program_directory()
|
486
|
+
|
461
487
|
load_config_file()
|
462
488
|
|
463
489
|
register_modules()
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.12
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -328,6 +328,7 @@ files:
|
|
328
328
|
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
329
329
|
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
330
330
|
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
331
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
331
332
|
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
332
333
|
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
333
334
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|