apollo-crawler 0.0.46 → 0.0.47

Sign up to get free protection for your applications and to get access to all the features.
data/bin/apollo-crawler CHANGED
@@ -308,7 +308,10 @@ module Apollo
308
308
  return
309
309
  end
310
310
 
311
- crawlers = ARGV
311
+ crawlers = []
312
+ if(ARGV.length > 0)
313
+ crawlers << ARGV.shift
314
+ end
312
315
 
313
316
  if(@options[:run_all])
314
317
  crawlers = @crawlers.keys
@@ -331,16 +334,18 @@ module Apollo
331
334
  puts "Running '#{crawler}'"
332
335
  end
333
336
 
334
- res = p.new.etl
337
+ res = p.new.etl(ARGV)
335
338
  if(res.nil?)
336
339
  next
337
340
  end
338
341
 
339
- # puts Apollo::Crawler::Formatters::Json.format(res)
340
- # puts Apollo::Crawler::Formatters::Plain.format(res)
341
- # puts Apollo::Crawler::Formatters::Table.format(res)
342
+ if(res.kind_of?(Array) == false)
343
+ res = [res]
344
+ end
342
345
 
343
- puts @formatter.format(res)
346
+ res.each do |tmp|
347
+ puts @formatter.format(tmp)
348
+ end
344
349
  end
345
350
  end
346
351
  end
@@ -4,6 +4,11 @@ require "nokogiri"
4
4
  module Apollo
5
5
  module Crawlers
6
6
  class Crawler
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
7
12
 
8
13
  # Name of the crawler
9
14
  def name
@@ -20,11 +25,31 @@ module Apollo
20
25
  # - (3) Go to (0) eventually
21
26
  def etl(url=nil)
22
27
  # Look for the passed URL, use the default instead if none was given, and fail if it is not valid
23
- url = url ? url : self.url
28
+ if(url.empty?)
29
+ url = self.url
30
+ end
31
+
24
32
  if(url.nil?)
25
33
  return nil
26
34
  end
27
35
 
36
+ if(url.kind_of?(Array))
37
+ @backlog.concat(url)
38
+ else
39
+ @backlog << url
40
+ end
41
+
42
+ res = []
43
+ while(@backlog.empty? == false)
44
+ url = @backlog.shift
45
+
46
+ # puts "Processing '#{url}'"
47
+ res << self.process_url(url)
48
+ end
49
+ return res
50
+ end
51
+
52
+ def process_url(url)
28
53
  # Try fetch document
29
54
  doc = self.fetch_document(url)
30
55
  if(doc.nil?)
@@ -48,14 +73,18 @@ module Apollo
48
73
 
49
74
  # Fetch document
50
75
  def fetch_document(url)
51
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
76
+ if(url == nil)
77
+ url = self.url
78
+ end
52
79
 
53
- if(self.url.nil?)
80
+ if(url.nil?)
54
81
  return nil
55
82
  end
56
83
 
84
+ raw = open(self.url).read
85
+
57
86
  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so on
58
- doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
87
+ doc = Nokogiri::HTML(raw)
59
88
  return doc
60
89
  end
61
90
 
@@ -1,8 +1,6 @@
1
- require 'iconv'
2
-
3
1
  module Apollo
4
2
  module Crawlers
5
- class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
3
+ class CRAWLER_CLASS_NAME < Crawler
6
4
  @@MATCHER_ITEM = "CRAWLER_MATCHER"
7
5
 
8
6
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Alexa < Apollo::Crawlers::Crawler
5
+ class Alexa < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Firmy < Apollo::Crawlers::Crawler
5
+ class Firmy < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Slashdot < Apollo::Crawlers::Crawler
5
+ class Slashdot < Crawler
8
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
9
7
 
10
8
  def name
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class StackOverflow < Apollo::Crawlers::Crawler
5
+ class StackOverflow < Crawler
8
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
9
7
 
10
8
  def name
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Xkcd < Apollo::Crawlers::Crawler
5
+ class Xkcd < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class HackerNews < Apollo::Crawlers::Crawler
5
+ class HackerNews < Crawler
8
6
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
9
7
 
10
8
  def name
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.46'
3
+ VERSION = '0.0.47'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.46
4
+ version: 0.0.47
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -107,22 +107,6 @@ dependencies:
107
107
  - - ! '>='
108
108
  - !ruby/object:Gem::Version
109
109
  version: '0'
110
- - !ruby/object:Gem::Dependency
111
- name: iconv
112
- requirement: !ruby/object:Gem::Requirement
113
- none: false
114
- requirements:
115
- - - ! '>='
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :runtime
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ! '>='
124
- - !ruby/object:Gem::Version
125
- version: '0'
126
110
  - !ruby/object:Gem::Dependency
127
111
  name: json
128
112
  requirement: !ruby/object:Gem::Requirement