apollo-crawler 0.1.12 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/lib/apollo_crawler.rb +1 -0
- data/lib/apollo_crawler/config.rb +1 -1
- data/lib/apollo_crawler/crawler/base_crawler.rb +1 -1
- data/lib/apollo_crawler/crawler/youjizz_crawler.rb +59 -0
- data/lib/apollo_crawler/fetcher/smart_fetcher.rb +1 -1
- data/lib/apollo_crawler/lib.rb +1 -0
- data/lib/apollo_crawler/program.rb +32 -6
- data/lib/apollo_crawler/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
|
10
|
+
MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
|
11
|
+
NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
|
14
|
+
ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
|
15
|
+
YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
|
data/lib/apollo_crawler.rb
CHANGED
@@ -39,6 +39,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_cr
|
|
39
39
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
|
40
40
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
|
41
41
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
|
42
|
+
require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
|
42
43
|
|
43
44
|
# Fetchers
|
44
45
|
require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
|
@@ -160,7 +160,7 @@ module Apollo
|
|
160
160
|
def self.create_metadoc(url, doc)
|
161
161
|
return {
|
162
162
|
'url' => url,
|
163
|
-
'doc' => doc.encode('
|
163
|
+
'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
|
164
164
|
'hash' => Digest::SHA256.new.update(doc).hexdigest,
|
165
165
|
'created_at' => Time.now.utc,
|
166
166
|
'expires_at' => nil,
|
@@ -0,0 +1,59 @@
|
|
1
|
+
# Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights
|
6
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
# copies of the Software, and to permit persons to whom the Software is
|
8
|
+
# furnished to do so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in
|
11
|
+
# all copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
# THE SOFTWARE.
|
20
|
+
|
21
|
+
module Apollo
|
22
|
+
module Crawler
|
23
|
+
class YoujizzCrawler < BaseCrawler
|
24
|
+
@@MATCHER_ITEM = "//span[@id = 'miniatura']/span/a"
|
25
|
+
|
26
|
+
def name()
|
27
|
+
return "Youjizz"
|
28
|
+
end
|
29
|
+
|
30
|
+
def url()
|
31
|
+
return "http://www.youjizz.com/"
|
32
|
+
end
|
33
|
+
|
34
|
+
def extract_data(doc)
|
35
|
+
res = doc.xpath(@@MATCHER_ITEM).map { | node |
|
36
|
+
link = BaseCrawler.try_get_url(self.url, node['href'])
|
37
|
+
next if link.nil?
|
38
|
+
|
39
|
+
{
|
40
|
+
:text => link,
|
41
|
+
:link => link
|
42
|
+
}
|
43
|
+
}
|
44
|
+
end
|
45
|
+
|
46
|
+
def extract_links(doc)
|
47
|
+
res = doc.xpath("//div[@id = 'pagination']/a").map { | node |
|
48
|
+
link = BaseCrawler.try_get_url(self.url, node['href'])
|
49
|
+
next if link.nil?
|
50
|
+
|
51
|
+
{
|
52
|
+
:link => link
|
53
|
+
}
|
54
|
+
}
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end # Crawler
|
58
|
+
end # Apollo
|
59
|
+
|
data/lib/apollo_crawler/lib.rb
CHANGED
@@ -36,6 +36,7 @@ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
|
|
36
36
|
require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
|
37
37
|
require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
|
38
38
|
require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
|
39
|
+
require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
|
39
40
|
|
40
41
|
# Fetchers
|
41
42
|
require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
|
@@ -43,6 +43,10 @@ require File.join(File.dirname(__FILE__), 'version')
|
|
43
43
|
|
44
44
|
module Apollo
|
45
45
|
class CrawlerProgram
|
46
|
+
@@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
|
47
|
+
@@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
|
48
|
+
|
49
|
+
# Load default config
|
46
50
|
require File.join(File.dirname(__FILE__), "config")
|
47
51
|
|
48
52
|
# This hash will hold all of the options
|
@@ -130,7 +134,11 @@ module Apollo
|
|
130
134
|
|
131
135
|
opts.on(nil, '--list-formatters', 'List of formatters available') do
|
132
136
|
@options[:list_formatters] = true
|
133
|
-
end
|
137
|
+
end
|
138
|
+
|
139
|
+
opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
|
140
|
+
@options[:silent] = true
|
141
|
+
end
|
134
142
|
end
|
135
143
|
end
|
136
144
|
|
@@ -197,13 +205,13 @@ module Apollo
|
|
197
205
|
# Load global options first
|
198
206
|
# Merge it with local options (if they exists)
|
199
207
|
def load_config_file()
|
200
|
-
config =
|
208
|
+
config = @@CONFIG_PATH
|
209
|
+
|
201
210
|
if(File.exists?(config))
|
202
211
|
if(@options[:verbose])
|
203
|
-
|
212
|
+
puts "Loading config '#{config}'"
|
204
213
|
end
|
205
214
|
|
206
|
-
# puts "Let's require '#{@options[:verbose]}'"
|
207
215
|
require config
|
208
216
|
else
|
209
217
|
if(@options[:verbose])
|
@@ -429,8 +437,10 @@ module Apollo
|
|
429
437
|
docs = [docs]
|
430
438
|
end
|
431
439
|
|
432
|
-
|
433
|
-
|
440
|
+
if @options[:silent] != true
|
441
|
+
docs.each do |doc|
|
442
|
+
puts @formatter.format(doc)
|
443
|
+
end
|
434
444
|
end
|
435
445
|
}
|
436
446
|
end
|
@@ -452,12 +462,28 @@ module Apollo
|
|
452
462
|
return crawlers
|
453
463
|
end
|
454
464
|
|
465
|
+
def init_program_directory()
|
466
|
+
dir = File.expand_path("~/.apollo-crawler")
|
467
|
+
if(File.directory?(dir) == false)
|
468
|
+
FileUtils.mkpath(dir)
|
469
|
+
end
|
470
|
+
|
471
|
+
config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
|
472
|
+
dest_path = File.join(dir, 'config.rb')
|
473
|
+
|
474
|
+
if(File.exists?(config_path) && File.exists?(dest_path) == false)
|
475
|
+
FileUtils.cp(config_path, dest_path)
|
476
|
+
end
|
477
|
+
end
|
478
|
+
|
455
479
|
# Init program
|
456
480
|
def init_program(args)
|
457
481
|
init_options()
|
458
482
|
|
459
483
|
parse_options(args)
|
460
484
|
|
485
|
+
init_program_directory()
|
486
|
+
|
461
487
|
load_config_file()
|
462
488
|
|
463
489
|
register_modules()
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: apollo-crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.12
|
4
|
+
version: 0.1.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tomas Korcak
|
@@ -328,6 +328,7 @@ files:
|
|
328
328
|
- ./lib/apollo_crawler/cache/memcached_cache.rb
|
329
329
|
- ./lib/apollo_crawler/crawler/xkcd_crawler.rb
|
330
330
|
- ./lib/apollo_crawler/crawler/google_crawler.rb
|
331
|
+
- ./lib/apollo_crawler/crawler/youjizz_crawler.rb
|
331
332
|
- ./lib/apollo_crawler/crawler/slashdot_crawler.rb
|
332
333
|
- ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
|
333
334
|
- ./lib/apollo_crawler/crawler/base_crawler.rb
|