scrapey 0.0.19 → 0.0.22

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
4
+ ZjczYTJlODBiMjJkMjk1ZThiZjZmYTFiOWIxM2RiZDUzNGI2N2Y3Nw==
5
5
  data.tar.gz: !binary |-
6
- YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
6
+ ZmI2ZTcwYTcxMDgxZDA4MzU5MTAzMGUwZmM5MmJmZmJiYzZjZWVmYQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
10
- ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
11
- NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
9
+ YWM2NTBiOTQ2OTU4YTNjY2Q3ZmExODVhZGEzM2E5ODViYWVjM2I3YWY5MDM2
10
+ NjIxOWYwOGQ1Yjk3YjViZDQzYTAwM2I1NTRlZDdmMTM4MGYyMjM1MTVmZTA0
11
+ Zjc1NGI4YzMyYTY1ZTc4NzUyYzAyNGZhNzQ1NWQ2ZTRkM2U3M2I=
12
12
  data.tar.gz: !binary |-
13
- ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
14
- NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
15
- MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
13
+ MTA1ZTQ4NTg1NjJkYmZkZmVlYzEzNWQwYmFmNjk2ZWNlMDhjYmIwM2ZmNjM2
14
+ OWY0Njk3YmVmZjVmMjdlYTU0NTM0NjRjZmY3ZjUxYTAxNjk3ZmYzMTNlN2Ji
15
+ MTYxMjE0OGE5MWRkZDUxMTJiYzNlYWQ4NmUxM2EwM2QyNTk2NjA=
@@ -11,14 +11,16 @@ require "scrapey/database"
11
11
  require "scrapey/multi"
12
12
  require "scrapey/tee"
13
13
 
14
- require 'addressable/uri'
15
14
 
16
15
  EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
17
16
 
18
- class URI::Parser
19
- def split url
20
- a = Addressable::URI::parse url
21
- [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
17
+ unless ENV['USE_URI']
18
+ require 'addressable/uri'
19
+ class URI::Parser
20
+ def split url
21
+ a = Addressable::URI::parse url
22
+ [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
23
+ end
22
24
  end
23
25
  end
24
26
 
@@ -8,6 +8,12 @@ module Scrapey
8
8
  require 'scrapey/cache/disk'
9
9
  @config['cache_dir'] ||= "#{BASEDIR}/cache"
10
10
  FileUtils.mkdir_p @config['cache_dir']
11
+ ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l1|
12
+ ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l2|
13
+ FileUtils.mkdir_p "#{@config['cache_dir']}/#{l1}/#{l2}"
14
+ end
15
+ end
16
+
11
17
  end
12
18
  end
13
19
 
@@ -3,7 +3,7 @@ require 'fileutils'
3
3
  module Scrapey
4
4
 
5
5
  def cache_filename url
6
- @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
6
+ @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url).sub(/(.)(.)/, '\1/\2/\1\2') + ".cache"
7
7
  end
8
8
 
9
9
  def is_cached? url
@@ -35,6 +35,8 @@ module Scrapey
35
35
  Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
36
36
  rescue Exception => e
37
37
  puts e.message
38
+ # delete_cache url
39
+ # Mechanize::Page.new URI.parse(url), [], '<html></html>', nil, @agent
38
40
  end
39
41
  end
40
42
 
@@ -47,4 +49,4 @@ module Scrapey
47
49
  FileUtils.rm(cache_filename(url)) rescue nil
48
50
  end
49
51
 
50
- end
52
+ end
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.19"
2
+ VERSION = "0.0.22"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
@@ -1,11 +1,3 @@
1
- require 'addressable/uri'
2
-
3
- class URI::Parser
4
- def split url
5
- a = Addressable::URI::parse url
6
- [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
7
- end
8
- end
9
1
 
10
2
  class Hash
11
3
  def shuffle
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.22
5
5
  platform: ruby
6
6
  authors:
7
7
  - P Guardiario
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-16 00:00:00.000000000 Z
11
+ date: 2017-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize