apollo-crawler 0.1.12 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MjkxYjlmY2NjMDYwODcxN2JmMzA1MDM3NzM5NzQ1ZWVhNDNiYWQ0MQ==
4
+ OTQxNjRiOTJiNzc5MWZkZTg1NmE0YjA2OTE3YjQ3NmM5MjBhMTA2NQ==
5
5
  data.tar.gz: !binary |-
6
- N2ViMTdhNmQ1OGM3ZTczYjIxZWU1Y2NlY2NlYWMxMDM1MDkwZjBjYg==
6
+ NDM0OWUwMzM3OGNkNzk5YWMwNjMwYzFlNzUwNDc2MzFlZDZhMzJiMA==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- NmNiNjMzZGFjY2ZmOTNjOTU4NTAxMGQzODZlNGYyOWI2MGQzY2YwM2Q4ZmMw
10
- NDMzYzM2OTNkYmU2MzJiYzNhYzMwNGNmZDI0OWZiM2ZiZjJiYjFkMWExY2Rh
11
- NTA3MDkxNTA1OTk1NWE5ZWMyNGFiZjY5ODhiMDMxZDU5NjgwZDU=
9
+ YjY4ZTNmMmIxZmUwMmE1OWMxNzVmZTQyYmQ2NTQ0MDExM2IzM2Q4MmVlYmEz
10
+ MmJiNzAxMDAzNGM4ODIwODViYjk4YTU5OWY2NGU1ODU2MmYxMTI0NzdiMDg5
11
+ NTI5OWQ3MjkzZTRjZDAzNWYxZTc5YmNmZGM3OTc3OGFlNDNkZjM=
12
12
  data.tar.gz: !binary |-
13
- MWI1YWI3NjcwYzQ1NWVkZDI2ZjM3NjY5MGRjMzZkMDQxMThlMWE2MGU3MzAx
14
- ZWE5ZWIyYWExNjdjMjYyYjUxNTU5MWZlNGI5MWUzOWYwZGI2NjQ2YTNkMTIy
15
- YmRiNzIzNGY5ZThlNTdkMzIwODJkNjc2ZWUyNzQ5MWNlOWZlM2I=
13
+ M2NjZTJmYzFkNGFkNzQzOWY0ZTRhMjBmMThkYTIxYWFkNGJjYmFhZDgyMTQ4
14
+ ZWNlM2M1NGU2M2E3MjAwZjcyYmVmNmM0OTVmODQxMmY1ZmM5YWEyZDAwZWYy
15
+ YzMyYWMyNTBhYjg0YWRiZWNlMjcxNjQzOTMyMDlhYWU5ZjBjYjc=
@@ -39,6 +39,7 @@ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/hacker_news_cr
39
39
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/slashdot_crawler')
40
40
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/stackoverflow_crawler')
41
41
  require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/xkcd_crawler')
42
+ require File.join(File.dirname(__FILE__), 'apollo_crawler/crawler/youjizz_crawler')
42
43
 
43
44
  # Fetchers
44
45
  require File.join(File.dirname(__FILE__), 'apollo_crawler/fetcher/base_fetcher')
@@ -45,7 +45,7 @@ module RbConfig
45
45
  # CACHE_CLASS = Apollo::Cache::NullCache
46
46
 
47
47
  # Caching mechanism used by default
48
- CACHE_CLASS = Apollo::Cache::MemoryCache
48
+ CACHE_CLASS = Apollo::Cache::MongoCache
49
49
 
50
50
 
51
51
 
@@ -160,7 +160,7 @@ module Apollo
160
160
  def self.create_metadoc(url, doc)
161
161
  return {
162
162
  'url' => url,
163
- 'doc' => doc.encode('utf-8'),
163
+ 'doc' => doc.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
164
164
  'hash' => Digest::SHA256.new.update(doc).hexdigest,
165
165
  'created_at' => Time.now.utc,
166
166
  'expires_at' => nil,
@@ -0,0 +1,59 @@
1
+ # Copyright, 2013, by Tomas Korcak. <korczis@gmail.com>
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in
11
+ # all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ # THE SOFTWARE.
20
+
21
+ module Apollo
22
+ module Crawler
23
+ class YoujizzCrawler < BaseCrawler
24
+ @@MATCHER_ITEM = "//span[@id = 'miniatura']/span/a"
25
+
26
+ def name()
27
+ return "Youjizz"
28
+ end
29
+
30
+ def url()
31
+ return "http://www.youjizz.com/"
32
+ end
33
+
34
+ def extract_data(doc)
35
+ res = doc.xpath(@@MATCHER_ITEM).map { | node |
36
+ link = BaseCrawler.try_get_url(self.url, node['href'])
37
+ next if link.nil?
38
+
39
+ {
40
+ :text => link,
41
+ :link => link
42
+ }
43
+ }
44
+ end
45
+
46
+ def extract_links(doc)
47
+ res = doc.xpath("//div[@id = 'pagination']/a").map { | node |
48
+ link = BaseCrawler.try_get_url(self.url, node['href'])
49
+ next if link.nil?
50
+
51
+ {
52
+ :link => link
53
+ }
54
+ }
55
+ end
56
+ end
57
+ end # Crawler
58
+ end # Apollo
59
+
@@ -26,7 +26,7 @@ require File.join(File.dirname(__FILE__), 'base_fetcher')
26
26
  module Apollo
27
27
  module Fetcher
28
28
  class SmartFetcher < BaseFetcher
29
- @@DEFAULT_SLEEP = 3.0
29
+ @@DEFAULT_SLEEP = 1.0
30
30
  @@LAST_FETCH = nil
31
31
 
32
32
  def self.fetch(url)
@@ -36,6 +36,7 @@ require File.join(File.dirname(__FILE__), 'crawler/hacker_news_crawler')
36
36
  require File.join(File.dirname(__FILE__), 'crawler/slashdot_crawler')
37
37
  require File.join(File.dirname(__FILE__), 'crawler/stackoverflow_crawler')
38
38
  require File.join(File.dirname(__FILE__), 'crawler/xkcd_crawler')
39
+ require File.join(File.dirname(__FILE__), 'crawler/youjizz_crawler')
39
40
 
40
41
  # Fetchers
41
42
  require File.join(File.dirname(__FILE__), 'fetcher/base_fetcher')
@@ -43,6 +43,10 @@ require File.join(File.dirname(__FILE__), 'version')
43
43
 
44
44
  module Apollo
45
45
  class CrawlerProgram
46
+ @@PROGRAM_DIR = File.expand_path("~/.apollo-crawler")
47
+ @@CONFIG_PATH = File.join(@@PROGRAM_DIR, "config.rb")
48
+
49
+ # Load default config
46
50
  require File.join(File.dirname(__FILE__), "config")
47
51
 
48
52
  # This hash will hold all of the options
@@ -130,7 +134,11 @@ module Apollo
130
134
 
131
135
  opts.on(nil, '--list-formatters', 'List of formatters available') do
132
136
  @options[:list_formatters] = true
133
- end
137
+ end
138
+
139
+ opts.on('-s', '--silent', 'Silent mode - do not print processed document') do
140
+ @options[:silent] = true
141
+ end
134
142
  end
135
143
  end
136
144
 
@@ -197,13 +205,13 @@ module Apollo
197
205
  # Load global options first
198
206
  # Merge it with local options (if they exist)
199
207
  def load_config_file()
200
- config = File.join(File.dirname(__FILE__), "config", "crawler.rb")
208
+ config = @@CONFIG_PATH
209
+
201
210
  if(File.exists?(config))
202
211
  if(@options[:verbose])
203
- RbConfig::DEFAULT_LOGGER "Loading config '#{config}'"
212
+ puts "Loading config '#{config}'"
204
213
  end
205
214
 
206
- # puts "Let's require '#{@options[:verbose]}'"
207
215
  require config
208
216
  else
209
217
  if(@options[:verbose])
@@ -429,8 +437,10 @@ module Apollo
429
437
  docs = [docs]
430
438
  end
431
439
 
432
- docs.each do |doc|
433
- puts @formatter.format(doc)
440
+ if @options[:silent] != true
441
+ docs.each do |doc|
442
+ puts @formatter.format(doc)
443
+ end
434
444
  end
435
445
  }
436
446
  end
@@ -452,12 +462,28 @@ module Apollo
452
462
  return crawlers
453
463
  end
454
464
 
465
+ def init_program_directory()
466
+ dir = File.expand_path("~/.apollo-crawler")
467
+ if(File.directory?(dir) == false)
468
+ FileUtils.mkpath(dir)
469
+ end
470
+
471
+ config_path = File.join(File.dirname(__FILE__), 'config_user.trb')
472
+ dest_path = File.join(dir, 'config.rb')
473
+
474
+ if(File.exists?(config_path) && File.exists?(dest_path) == false)
475
+ FileUtils.cp(config_path, dest_path)
476
+ end
477
+ end
478
+
455
479
  # Init program
456
480
  def init_program(args)
457
481
  init_options()
458
482
 
459
483
  parse_options(args)
460
484
 
485
+ init_program_directory()
486
+
461
487
  load_config_file()
462
488
 
463
489
  register_modules()
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.12'
22
+ VERSION = '0.1.13'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.12
4
+ version: 0.1.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak
@@ -328,6 +328,7 @@ files:
328
328
  - ./lib/apollo_crawler/cache/memcached_cache.rb
329
329
  - ./lib/apollo_crawler/crawler/xkcd_crawler.rb
330
330
  - ./lib/apollo_crawler/crawler/google_crawler.rb
331
+ - ./lib/apollo_crawler/crawler/youjizz_crawler.rb
331
332
  - ./lib/apollo_crawler/crawler/slashdot_crawler.rb
332
333
  - ./lib/apollo_crawler/crawler/hacker_news_crawler.rb
333
334
  - ./lib/apollo_crawler/crawler/base_crawler.rb