apollo-crawler 0.1.12 → 0.1.13

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjkxYjlmY2NjMDYwODcxN2JmMzA1MDM3NzM5NzQ1ZWVhNDNiYWQ0MQ==
4
+ OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
5
5
  data.tar.gz: !binary |-
6
- N2ViMTdhNmQ1OGM3ZTczYjIxZWU1Y2NlY2NlYWMxMDM1MDkwZjBjYg==
6
+ NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NmNiNjMzZGFjY2ZmOTNjOTU4NTAxMGQzODZlNGYyOWI2MGQzY2YwM2Q4ZmMw
10
- NDMzYzM2OTNkYmU2MzJiYzNhYzMwNGNmZDI0OWZiM2ZiZjJiYjFkMWExY2Rh
11
- NTA3MDkxNTA1OTk1NWE5ZWMyNGFiZjY5ODhiMDMxZDU5NjgwZDU=
9
+ YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
10
+ MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
11
+ NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
12
12
  data.tar.gz: !binary |-
13
- MWI1YWI3NjcwYzQ1NWVkZDI2ZjM3NjY5MGRjMzZkMDQxMThlMWE2MGU3MzAx
14
- ZWE5ZWIyYWExNjdjMjYyYjUxNTU5MWZlNGI5MWUzOWYwZGI2NjQ2YTNkMTIy
15
- YmRiNzIzNGY5ZThlNTdkMzIwODJkNjc2ZWUyNzQ5MWNlOWZlM2I=
13
+ M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
14
+ ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
15
+ YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
@@ -39,6 +39,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_cr
39
39
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
40
40
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
41
41
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
42
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
42
43
 
43
44
  # Fetchers
44
45
  require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
@@ -45,7 +45,7 @@ module RbConfig
45
45
  # CACHE_CLASS = Apollo::Cache::NullCache
46
46
 
47
47
  # Used caching mechanism by default
48
- CACHE_CLASS = Apollo::Cache::MemoryCache
48
+ CACHE_CLASS = Apollo::Cache::MongoCache
49
49
 
50
50
 
51
51
 
@@ -160,7 +160,7 @@ module Apollo
160
160
  def self.create_metadoc(url, doc)
161
161
  return {
162
162
  'url' => url,
163
- 'doc' => doc.encode('utf-8'),
163
+ 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
164
164
  'hash' => Digest::SHA256.new.update(doc).hexdigest,
165
165
  'created_at' => Time.now.utc,
166
166
  'expires_at' => nil,
@@ -0,0 +1,59 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ module Apollo
22
+ module Crawler
23
+ class YoujizzCrawler < BaseCrawler
24
+ @@MATCHER_ITEM = "//span[@id = 'miniatura']/span/a"
25
+
26
+ def name()
27
+ return "Youjizz"
28
+ end
29
+
30
+ def url()
31
+ return "http://www.youjizz.com/"
32
+ end
33
+
34
+ def extract_data(doc)
35
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
36
+ link = BaseCrawler.try_get_url(self.url, node['href'])
37
+ next if link.nil?
38
+
39
+ {
40
+ :text => link,
41
+ :link => link
42
+ }
43
+ }
44
+ end
45
+
46
+ def extract_links(doc)
47
+ res = doc.xpath("//div[@id = 'pagination']/a").map { | node |
48
+ link = BaseCrawler.try_get_url(self.url, node['href'])
49
+ next if link.nil?
50
+
51
+ {
52
+ :link => link
53
+ }
54
+ }
55
+ end
56
+ end
57
+ end # Crawler
58
+ end # Apollo
59
+
@@ -26,7 +26,7 @@ require File.join(File.dirname(__FILE__), 'base_fetcher')
26
26
  module Apollo
27
27
  module Fetcher
28
28
  class SmartFetcher < BaseFetcher
29
- @@DEFAULT_SLEEP = 3.0
29
+ @@DEFAULT_SLEEP = 1.0
30
30
  @@LAST_FETCH = nil
31
31
 
32
32
  def self.fetch(url)
@@ -36,6 +36,7 @@ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
36
36
  require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
37
37
  require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
38
38
  require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
39
+ require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
39
40
 
40
41
  # Fetchers
41
42
  require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
@@ -43,6 +43,10 @@ require File.join(File.dirname(__FILE__), 'version')
43
43
 
44
44
  module Apollo
45
45
  class CrawlerProgram
46
+ @@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
47
+ @@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
48
+
49
+ # Load default config
46
50
  require File.join(File.dirname(__FILE__), "config")
47
51
 
48
52
  # This hash will hold all of the options
@@ -130,7 +134,11 @@ module Apollo
130
134
 
131
135
  opts.on(nil, '--list-formatters', 'List of formatters available') do
132
136
  @options[:list_formatters] = true
133
- end
137
+ end
138
+
139
+ opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
140
+ @options[:silent] = true
141
+ end
134
142
  end
135
143
  end
136
144
 
@@ -197,13 +205,13 @@ module Apollo
197
205
  # Load global options first
198
206
  # Merge it with local options (if they exists)
199
207
  def load_config_file()
200
- config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
208
+ config = @@CONFIG_PATH
209
+
201
210
  if(File.exists?(config))
202
211
  if(@options[:verbose])
203
- RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
212
+ puts "Loading config '#{config}'"
204
213
  end
205
214
 
206
- # puts "Let's require '#{@options[:verbose]}'"
207
215
  require config
208
216
  else
209
217
  if(@options[:verbose])
@@ -429,8 +437,10 @@ module Apollo
429
437
  docs = [docs]
430
438
  end
431
439
 
432
- docs.each do |doc|
433
- puts @formatter.format(doc)
440
+ if @options[:silent] != true
441
+ docs.each do |doc|
442
+ puts @formatter.format(doc)
443
+ end
434
444
  end
435
445
  }
436
446
  end
@@ -452,12 +462,28 @@ module Apollo
452
462
  return crawlers
453
463
  end
454
464
 
465
+ def init_program_directory()
466
+ dir = File.expand_path("~/.apollo-crawler")
467
+ if(File.directory?(dir) == false)
468
+ FileUtils.mkpath(dir)
469
+ end
470
+
471
+ config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
472
+ dest_path = File.join(dir, 'config.rb')
473
+
474
+ if(File.exists?(config_path) && File.exists?(dest_path) == false)
475
+ FileUtils.cp(config_path, dest_path)
476
+ end
477
+ end
478
+
455
479
  # Init program
456
480
  def init_program(args)
457
481
  init_options()
458
482
 
459
483
  parse_options(args)
460
484
 
485
+ init_program_directory()
486
+
461
487
  load_config_file()
462
488
 
463
489
  register_modules()
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.12'
22
+ VERSION = '0.1.13'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.1.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -328,6 +328,7 @@ files:
328
328
  - ./lib/apollo_crawler/cache/memcached_cache.rb
329
329
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
330
330
  - ./lib/apollo_crawler/crawler/google_crawler.rb
331
+ - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
331
332
  - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
332
333
  - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
333
334
  - ./lib/apollo_crawler/crawler/base_crawler.rb