apollo-crawler 0.1.28 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 22e840b1ad9ecabcb011c4b02383c060d325a61f
4
- data.tar.gz: faf47f1ae1b7451262037d7c3e28a83bb05a2678
3
+ metadata.gz: 4f7597400d1ab12d2ab2d1d52330ca29e4f2192f
4
+ data.tar.gz: 297216744ee4ea8a9af6595b1361dd3bfcf8d0b1
5
5
  SHA512:
6
- metadata.gz: f916c2da81657168389ddf998407d338bbdafd323dfb5b7f0e3b4dc9ef2e9d001d00974fee48d2918a12762a90e5172146820553b09d798720f33c319db052a7
7
- data.tar.gz: a820bf980c655156eefcad5487a79805f121f7e59d2e95285415d0229583b3df4440a4c0e8021186fc963837239cd0b1a9c1e8ca18718dc5db67bc54ff6ce9a7
6
+ metadata.gz: 71bf482ad465efa67976547c5e0a99d8a728478c5c346f987e675da8574b45e4dba44375fd501e8699c083e42eaf7e19ac1db6d2d9962f9736d408c52f896844
7
+ data.tar.gz: 01651a1aa4d3408ce63142a46c6e3727ceee35995520cfcc2df4b6451495b09cf5cebe1cc1244dca6ebef2138bb06b6ede97ac1720b75b8de866b3f381c2e25f
@@ -96,10 +96,10 @@ module Apollo
96
96
  url = queued_url["url"]
97
97
 
98
98
  res = Apollo::Model::RawDocument.new
99
- res.headers = doc.headers
100
- res.body = doc.body
101
- res.sha_hash = Digest::SHA1.hexdigest(doc.body)
102
- res.status = doc.status
99
+ res.headers = doc[:headers]
100
+ res.body = doc[:body]
101
+ res.sha_hash = Digest::SHA1.hexdigest(doc[:body])
102
+ res.status = doc[:status]
103
103
  res.url = url
104
104
 
105
105
  return res
@@ -170,10 +170,12 @@ module Apollo
170
170
  end
171
171
 
172
172
  def self.create_metadoc(url, doc)
173
+ body = doc[:body].encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'})
174
+
173
175
  return {
174
176
  'url' => url,
175
- 'doc' => doc.body.encode('UTF-8', {:invalid => :replace, :undef => :replace, :replace => '?'}),
176
- 'hash' => Digest::SHA256.new.update(doc.body).hexdigest,
177
+ 'doc' => body,
178
+ 'hash' => Digest::SHA256.new.update(body).hexdigest,
177
179
  'created_at' => Time.now.utc,
178
180
  'expires_at' => nil,
179
181
  'version' => 0
@@ -19,9 +19,15 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  require 'cgi'
22
- require "net/http"
23
- require "open-uri"
22
+
23
+ require 'open-uri'
24
+ require 'net/http'
25
+
24
26
  require 'faraday'
27
+ require 'faraday_middleware'
28
+
29
+ require 'mechanize'
30
+
25
31
  require 'ipaddr'
26
32
 
27
33
  # require 'resolv'
@@ -38,7 +44,7 @@ module Apollo
38
44
  }
39
45
  end
40
46
 
41
- def self.fetch(url, options = {})
47
+ def self.fetch_old(url, options = {})
42
48
  begin
43
49
  uri = URI.parse(url.to_s)
44
50
  rescue Exception => e
@@ -64,6 +70,29 @@ module Apollo
64
70
  end
65
71
 
66
72
  # Return result
73
+ return res
74
+ end
75
+
76
+ def self.fetch(url, options = {})
77
+ begin
78
+ uri = URI.parse(url.to_s)
79
+ rescue Exception => e
80
+ puts "EXCEPTION: BaseFetcher::fetch() - Unable to fetch: '#{e.to_s}'"
81
+ return nil
82
+ end
83
+
84
+ agent = Mechanize.new do |agent|
85
+ agent.user_agent = 'Apollo Crawler'
86
+ end
87
+
88
+ page = agent.get(uri)
89
+
90
+ res = {
91
+ :status => page.code,
92
+ :headers => page.header.to_hash,
93
+ :body => page.content
94
+ }
95
+
67
96
  return res
68
97
  end
69
98
  end # class BaseFetcher
@@ -90,7 +90,7 @@ module Apollo
90
90
  declarations[:queues]["planner.crawled.queue"].bind(declarations[:exchanges]["planner.crawled"]).subscribe do |delivery_info, metadata, payload|
91
91
  msg = JSON.parse(payload)
92
92
 
93
- puts "Crawled - msg.inspect"
93
+ # puts "Crawled - #{msg.inspect}"
94
94
 
95
95
  request = msg['request']
96
96
  response = msg['response']
@@ -18,6 +18,9 @@
18
18
  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
19
  # THE SOFTWARE.
20
20
 
21
+ require "net/http"
22
+ require "uri"
23
+
21
24
  require File.join(File.dirname(__FILE__), "../model/models.rb")
22
25
 
23
26
  module Apollo
@@ -29,6 +32,13 @@ module Apollo
29
32
  if queued_url.nil? == false
30
33
  return queued_url
31
34
  end
35
+
36
+ uri = URI.parse(url)
37
+ domain = Apollo::Model::Domain.where(:name => uri.hostname).first
38
+ if(domain.nil?)
39
+ domain = Apollo::Model::Domain.new(:name => uri.hostname)
40
+ domain.save
41
+ end
32
42
 
33
43
  res = Apollo::Model::QueuedUrl.new(:url => url, :state => :queued, :crawler_name => crawler.to_s)
34
44
  res.save
@@ -19,5 +19,5 @@
19
19
  # THE SOFTWARE.
20
20
 
21
21
  module Apollo
22
- VERSION = '0.1.28'
22
+ VERSION = '0.1.30'
23
23
  end # Apollo
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: apollo-crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.28
4
+ version: 0.1.30
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tomas Korcak