apollo-crawler 0.1.28 → 0.1.30

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 22e840b1ad9ecabcb011c4b02383c060d325a61f
4
- data.tar.gz: faf47f1ae1b7451262037d7c3e28a83bb05a2678
3
+ metadata.gz: 4f7597400d1ab12d2ab2d1d52330ca29e4f2192f
4
+ data.tar.gz: 297216744ee4ea8a9af6595b1361dd3bfcf8d0b1
5
5
  SHA512:
6
- metadata.gz: f916c2da81657168389ddf998407d338bbdafd323dfb5b7f0e3b4dc9ef2e9d001d00974fee48d2918a12762a90e5172146820553b09d798720f33c319db052a7
7
- data.tar.gz: a820bf980c655156eefcad5487a79805f121f7e59d2e95285415d0229583b3df4440a4c0e8021186fc963837239cd0b1a9c1e8ca18718dc5db67bc54ff6ce9a7
6
+ metadata.gz: 71bf482ad465efa67976547c5e0a99d8a728478c5c346f987e675da8574b45e4dba44375fd501e8699c083e42eaf7e19ac1db6d2d9962f9736d408c52f896844
7
+ data.tar.gz: 01651a1aa4d3408ce63142a46c6e3727ceee35995520cfcc2df4b6451495b09cf5cebe1cc1244dca6ebef2138bb06b6ede97ac1720b75b8de866b3f381c2e25f
@@ -96,10 +96,10 @@ module Apollo
96
96
  url = queued_url["url"]
97
97
 
98
98
  res = Apollo::Model::RawDocument.new
99
- res.headers = doc.headers
100
- res.body = doc.body
101
- res.sha_hash = Digest::SHA1.hexdigest(doc.body)
102
- res.status = doc.status
99
+ res.headers = doc[:headers]
100
+ res.body = doc[:body]
101
+ res.sha_hash = Digest::SHA1.hexdigest(doc[:body])
102
+ res.status = doc[:status]
103
103
  res.url = url
104
104
 
105
105
  return res
@@ -170,10 +170,12 @@ module Apollo
170
170
  end
171
171
 
172
172
  def self.create_metadoc(url, doc)
173
+ body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})
174
+
173
175
  return {
174
176
  'url' => url,
175
- 'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
- 'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
177
+ 'doc' => body,
178
+ 'hash' => Digest::SHA256.new.update(body).hexdigest,
177
179
  'created_at' => Time.now.utc,
178
180
  'expires_at' => nil,
179
181
  'version' => 0
@@ -19,9 +19,15 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  require 'cgi'
22
- require "net/http"
23
- require "open-uri"
22
+
23
+ require 'open-uri'
24
+ require 'net/http'
25
+
24
26
  require 'faraday'
27
+ require 'faraday_middleware'
28
+
29
+ require 'mechanize'
30
+
25
31
  require 'ipaddr'
26
32
 
27
33
  # require 'resolv'
@@ -38,7 +44,7 @@ module Apollo
38
44
  }
39
45
  end
40
46
 
41
- def self.fetch(url, options = {})
47
+ def self.fetch_old(url, options = {})
42
48
  begin
43
49
  uri = URI.parse(url.to_s)
44
50
  rescue Exception => e
@@ -64,6 +70,29 @@ module Apollo
64
70
  end
65
71
 
66
72
  # Return result
73
+ return res
74
+ end
75
+
76
+ def self.fetch(url, options = {})
77
+ begin
78
+ uri = URI.parse(url.to_s)
79
+ rescue Exception => e
80
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
81
+ return nil
82
+ end
83
+
84
+ agent = Mechanize.new do |agent|
85
+ agent.user_agent = 'Apollo Crawler'
86
+ end
87
+
88
+ page = agent.get(uri)
89
+
90
+ res = {
91
+ :status => page.code,
92
+ :headers => page.header.to_hash,
93
+ :body => page.content
94
+ }
95
+
67
96
  return res
68
97
  end
69
98
  end # class BaseFetcher
@@ -90,7 +90,7 @@ module Apollo
90
90
  declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
91
91
  msg = JSON.parse(payload)
92
92
 
93
- puts "Crawled - msg.inspect"
93
+ # puts "Crawled - #{msg.inspect}"
94
94
 
95
95
  request = msg['request']
96
96
  response = msg['response']
@@ -18,6 +18,9 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require "net/http"
22
+ require "uri"
23
+
21
24
  require File.join(File.dirname(__FILE__), "../model/models.rb")
22
25
 
23
26
  module Apollo
@@ -29,6 +32,13 @@ module Apollo
29
32
  if queued_url.nil? == false
30
33
  return queued_url
31
34
  end
35
+
36
+ uri = URI.parse(url)
37
+ domain = Apollo::Model::Domain.where(:name => uri.hostname).first
38
+ if(domain.nil?)
39
+ domain = Apollo::Model::Domain.new(:name => uri.hostname)
40
+ domain.save
41
+ end
32
42
 
33
43
  res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
34
44
  res.save
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.28'
22
+ VERSION = '0.1.30'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.28
4
+ version: 0.1.30
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak