apollo-crawler 0.0.46 → 0.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/apollo-crawler CHANGED
@@ -308,7 +308,10 @@ module Apollo
308
308
  return
309
309
  end
310
310
 
311
- crawlers = ARGV
311
+ crawlers = []
312
+ if(ARGV.length > 0)
313
+ crawlers << ARGV.shift
314
+ end
312
315
 
313
316
  if(@options[:run_all])
314
317
  crawlers = @crawlers.keys
@@ -331,16 +334,18 @@ module Apollo
331
334
  puts "Running '#{crawler}'"
332
335
  end
333
336
 
334
- res = p.new.etl
337
+ res = p.new.etl(ARGV)
335
338
  if(res.nil?)
336
339
  next
337
340
  end
338
341
 
339
- # puts Apollo::Crawler::Formatters::Json.format(res)
340
- # puts Apollo::Crawler::Formatters::Plain.format(res)
341
- # puts Apollo::Crawler::Formatters::Table.format(res)
342
+ if(res.kind_of?(Array) == false)
343
+ res = [res]
344
+ end
342
345
 
343
- puts @formatter.format(res)
346
+ res.each do |tmp|
347
+ puts @formatter.format(tmp)
348
+ end
344
349
  end
345
350
  end
346
351
  end
@@ -4,6 +4,11 @@ require "nokogiri"
4
4
  module Apollo
5
5
  module Crawlers
6
6
  class Crawler
7
+ @backlog = nil
8
+
9
+ def initialize
10
+ @backlog = []
11
+ end
7
12
 
8
13
  # Name of the crawler
9
14
  def name
@@ -20,11 +25,31 @@ module Apollo
20
25
  # - (3) Go to (0) eventually
21
26
  def etl(url=nil)
22
27
  # Look for passed URL use default instead and fail if it is not valid
23
- url = url ? url : self.url
28
+ if(url.empty?)
29
+ url = self.url
30
+ end
31
+
24
32
  if(url.nil?)
25
33
  return nil
26
34
  end
27
35
 
36
+ if(url.kind_of?(Array))
37
+ @backlog.concat(url)
38
+ else
39
+ @backlog << url
40
+ end
41
+
42
+ res = []
43
+ while(@backlog.empty? == false)
44
+ url = @backlog.shift
45
+
46
+ # puts "Processing '#{url}'"
47
+ res << self.process_url(url)
48
+ end
49
+ return res
50
+ end
51
+
52
+ def process_url(url)
28
53
  # Try fetch document
29
54
  doc = self.fetch_document(url)
30
55
  if(doc.nil?)
@@ -48,14 +73,18 @@ module Apollo
48
73
 
49
74
  # Fetch document
50
75
  def fetch_document(url)
51
- ic = Iconv.new("UTF-8//IGNORE", "UTF-8")
76
+ if(url == nil)
77
+ url = self.url
78
+ end
52
79
 
53
- if(self.url.nil?)
80
+ if(url.nil?)
54
81
  return nil
55
82
  end
56
83
 
84
+ raw = open(self.url).read
85
+
57
86
  # TODO: Encapsulate and make more robust => invalid hostname, timeouts and so
58
- doc = Nokogiri::HTML(ic.iconv(open(self.url).read))
87
+ doc = Nokogiri::HTML(raw)
59
88
  return doc
60
89
  end
61
90
 
@@ -1,8 +1,6 @@
1
- require 'iconv'
2
-
3
1
  module Apollo
4
2
  module Crawlers
5
- class CRAWLER_CLASS_NAME < Apollo::Crawler::Crawlers::Crawler
3
+ class CRAWLER_CLASS_NAME < Crawler
6
4
  @@MATCHER_ITEM = "CRAWLER_MATCHER"
7
5
 
8
6
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Alexa < Apollo::Crawlers::Crawler
5
+ class Alexa < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Firmy < Apollo::Crawlers::Crawler
5
+ class Firmy < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'alphabetically']/ul/li/a"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Slashdot < Apollo::Crawlers::Crawler
5
+ class Slashdot < Crawler
8
6
  @@MATCHER_ITEM = "//article/header/h2/span/a"
9
7
 
10
8
  def name
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class StackOverflow < Apollo::Crawlers::Crawler
5
+ class StackOverflow < Crawler
8
6
  @@MATCHER_ITEM = "//div[@class = 'summary']/h3/a"
9
7
 
10
8
  def name
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class Xkcd < Apollo::Crawlers::Crawler
5
+ class Xkcd < Crawler
8
6
  @@MATCHER_ITEM = "//div[@id = 'comic']/img"
9
7
 
10
8
  def name()
@@ -1,10 +1,8 @@
1
- require 'iconv'
2
-
3
1
  require File.join(File.dirname(__FILE__), '..', '..', 'crawler')
4
2
 
5
3
  module Apollo
6
4
  module Crawlers
7
- class HackerNews < Apollo::Crawlers::Crawler
5
+ class HackerNews < Crawler
8
6
  @@MATCHER_ITEM = "//td[@class = 'title']/a"
9
7
 
10
8
  def name
@@ -1,5 +1,5 @@
1
1
  module Apollo
2
2
  module Crawler
3
- VERSION = '0.0.46'
3
+ VERSION = '0.0.47'
4
4
  end # Crawler
5
5
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.46
4
+ version: 0.0.47
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -107,22 +107,6 @@ dependencies:
107
107
  - - ! '>='
108
108
  - !ruby/object:Gem::Version
109
109
  version: '0'
110
- - !ruby/object:Gem::Dependency
111
- name: iconv
112
- requirement: !ruby/object:Gem::Requirement
113
- none: false
114
- requirements:
115
- - - ! '>='
116
- - !ruby/object:Gem::Version
117
- version: '0'
118
- type: :runtime
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- none: false
122
- requirements:
123
- - - ! '>='
124
- - !ruby/object:Gem::Version
125
- version: '0'
126
110
  - !ruby/object:Gem::Dependency
127
111
  name: json
128
112
  requirement: !ruby/object:Gem::Requirement