scrapey 0.0.19 → 0.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWMwZWY5N2ExNjliMDVhNmQzNDEyNjJmZjExNjZkNjMyM2VkMGFhZA==
4
+ ZjczYTJlODBiMjJkMjk1ZThiZjZmYTFiOWIxM2RiZDUzNGI2N2Y3Nw==
5
5
  data.tar.gz: !binary |-
6
- YmNjZDFkMTg0MDZjZmZmMDM4ZjRmMzNiYjVjZWE3NTk3Y2YwYWY1Mg==
6
+ ZmI2ZTcwYTcxMDgxZDA4MzU5MTAzMGUwZmM5MmJmZmJiYzZjZWVmYQ==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- NmEwY2RjZTAxNDBlM2JmNmI0MGExZDg0MGI3OWJkZjlmZTI0YTQ2ZDkyNTI4
10
- ZDEyOGVhNDAxYjIxYWVkOTkyZTUwNDM3ZDY2MzJiZTEwYjZkN2M2ZGExYjNh
11
- NWYwOWJmNzFiMjYwNjAyYzRlNmRiNDA2MjUzNGJlOTNmNWMzNDc=
9
+ YWM2NTBiOTQ2OTU4YTNjY2Q3ZmExODVhZGEzM2E5ODViYWVjM2I3YWY5MDM2
10
+ NjIxOWYwOGQ1Yjk3YjViZDQzYTAwM2I1NTRlZDdmMTM4MGYyMjM1MTVmZTA0
11
+ Zjc1NGI4YzMyYTY1ZTc4NzUyYzAyNGZhNzQ1NWQ2ZTRkM2U3M2I=
12
12
  data.tar.gz: !binary |-
13
- ZjkzYmMzYjJlZTFkNmU0NWVjMTQ3YjBhZGE5NzY5ZTdmN2Q2N2U0NGI1YmEx
14
- NTVmN2MzYTVkN2FjYTA2YWRkNGY1Y2RjNDY5MjE4YjIxOGNjMmVmNGQ4MzA3
15
- MjE2MTRkNzkwMDc3MGYwMjE0M2Y3YmI2M2RhZWUzZDJlMGVhNWY=
13
+ MTA1ZTQ4NTg1NjJkYmZkZmVlYzEzNWQwYmFmNjk2ZWNlMDhjYmIwM2ZmNjM2
14
+ OWY0Njk3YmVmZjVmMjdlYTU0NTM0NjRjZmY3ZjUxYTAxNjk3ZmYzMTNlN2Ji
15
+ MTYxMjE0OGE5MWRkZDUxMTJiYzNlYWQ4NmUxM2EwM2QyNTk2NjA=
@@ -11,14 +11,16 @@ require "scrapey/database"
11
11
  require "scrapey/multi"
12
12
  require "scrapey/tee"
13
13
 
14
- require 'addressable/uri'
15
14
 
16
15
  EMAIL_REGEX = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b/i
17
16
 
18
- class URI::Parser
19
- def split url
20
- a = Addressable::URI::parse url
21
- [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
17
+ unless ENV['USE_URI']
18
+ require 'addressable/uri'
19
+ class URI::Parser
20
+ def split url
21
+ a = Addressable::URI::parse url
22
+ [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
23
+ end
22
24
  end
23
25
  end
24
26
 
@@ -8,6 +8,12 @@ module Scrapey
8
8
  require 'scrapey/cache/disk'
9
9
  @config['cache_dir'] ||= "#{BASEDIR}/cache"
10
10
  FileUtils.mkdir_p @config['cache_dir']
11
+ ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l1|
12
+ ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f"].each do |l2|
13
+ FileUtils.mkdir_p "#{@config['cache_dir']}/#{l1}/#{l2}"
14
+ end
15
+ end
16
+
11
17
  end
12
18
  end
13
19
 
@@ -3,7 +3,7 @@ require 'fileutils'
3
3
  module Scrapey
4
4
 
5
5
  def cache_filename url
6
- @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url) + ".cache"
6
+ @config['cache_dir'] + "/" + Digest::MD5.hexdigest(url).sub(/(.)(.)/, '\1/\2/\1\2') + ".cache"
7
7
  end
8
8
 
9
9
  def is_cached? url
@@ -35,6 +35,8 @@ module Scrapey
35
35
  Mechanize::Page.new URI.parse(url), [], Marshal.load(Zlib::Inflate.inflate(File.open(filename, "rb"){|f| f.read})), nil, @agent
36
36
  rescue Exception => e
37
37
  puts e.message
38
+ # delete_cache url
39
+ # Mechanize::Page.new URI.parse(url), [], '<html></html>', nil, @agent
38
40
  end
39
41
  end
40
42
 
@@ -47,4 +49,4 @@ module Scrapey
47
49
  FileUtils.rm(cache_filename(url)) rescue nil
48
50
  end
49
51
 
50
- end
52
+ end
@@ -1,5 +1,5 @@
1
1
  module Scrapey
2
- VERSION = "0.0.19"
2
+ VERSION = "0.0.22"
3
3
  BASEDIR = File.expand_path(File.dirname($0)).gsub(/\/src$/,'')
4
4
  URL = "https://github.com/monkeysuffrage/scrapey"
5
5
  end
@@ -1,11 +1,3 @@
1
- require 'addressable/uri'
2
-
3
- class URI::Parser
4
- def split url
5
- a = Addressable::URI::parse url
6
- [a.scheme, a.userinfo, a.host, a.port, nil, a.path, nil, a.query, a.fragment]
7
- end
8
- end
9
1
 
10
2
  class Hash
11
3
  def shuffle
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapey
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.19
4
+ version: 0.0.22
5
5
  platform: ruby
6
6
  authors:
7
7
  - P Guardiario
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-16 00:00:00.000000000 Z
11
+ date: 2017-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize