url_reader 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 90d8ea16e1c937474887f35170886a5e3e283c5e
4
+ data.tar.gz: eec8a64973d2c4ecba39f266cf6a168a6e21145b
5
+ SHA512:
6
+ metadata.gz: 97e812e7b7fa2ef8f004b86149ef9e28de861daa8190063de2ef0e2aec6e60b55a4fbcd9bd2688ae7fb25c62fe2b69037e7ac521c2042205180d773ce2a1be50
7
+ data.tar.gz: b7b3e385fdb44e1b58cbf71348556accc336d5ca3e0eb2ea1ef254b1d85cde697e2b1e31431ad3c47588334b6a846c6dfb779f032551892b8f2899117c11417a
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ /*.gem
2
+ /tmp
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.2
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,55 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ url_reader (0.1.0)
5
+ activesupport (>= 4)
6
+ rest-client
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ activesupport (4.1.8)
12
+ i18n (~> 0.6, >= 0.6.9)
13
+ json (~> 1.7, >= 1.7.7)
14
+ minitest (~> 5.1)
15
+ thread_safe (~> 0.1)
16
+ tzinfo (~> 1.1)
17
+ addressable (2.3.6)
18
+ crack (0.4.2)
19
+ safe_yaml (~> 1.0.0)
20
+ diff-lcs (1.2.5)
21
+ i18n (0.6.11)
22
+ json (1.8.1)
23
+ mime-types (2.4.3)
24
+ minitest (5.4.3)
25
+ netrc (0.8.0)
26
+ rest-client (1.7.2)
27
+ mime-types (>= 1.16, < 3.0)
28
+ netrc (~> 0.7)
29
+ rspec (3.1.0)
30
+ rspec-core (~> 3.1.0)
31
+ rspec-expectations (~> 3.1.0)
32
+ rspec-mocks (~> 3.1.0)
33
+ rspec-core (3.1.7)
34
+ rspec-support (~> 3.1.0)
35
+ rspec-expectations (3.1.2)
36
+ diff-lcs (>= 1.2.0, < 2.0)
37
+ rspec-support (~> 3.1.0)
38
+ rspec-mocks (3.1.3)
39
+ rspec-support (~> 3.1.0)
40
+ rspec-support (3.1.2)
41
+ safe_yaml (1.0.4)
42
+ thread_safe (0.3.4)
43
+ tzinfo (1.2.2)
44
+ thread_safe (~> 0.1)
45
+ webmock (1.20.3)
46
+ addressable (>= 2.3.6)
47
+ crack (>= 0.3.2)
48
+
49
+ PLATFORMS
50
+ ruby
51
+
52
+ DEPENDENCIES
53
+ rspec
54
+ url_reader!
55
+ webmock
data/README.md ADDED
File without changes
data/lib/initialize.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'active_support/all'
5
+ require 'rest_client'
6
+
7
+ ActiveSupport::Dependencies.autoload_paths << File.expand_path('../../lib', __FILE__)
@@ -0,0 +1,18 @@
1
# Base class for the errors raised by UrlReader.
#
# An instance is built either from a plain message or by wrapping another
# exception. When wrapping, the wrapped exception's class name and message are
# folded into this error's message and its backtrace is reused.
class UrlReader::BaseError < StandardError
  # Builds the message used when wrapping another exception.
  #
  # @param error [Exception] the exception being wrapped
  # @param additional_msg [String, nil] optional context appended after ", "
  # @return [String] "<ErrorClass>: <message>" plus the optional context
  def self.msg(error, additional_msg = nil)
    base = "#{error.class.name}: #{error.message}"
    additional_msg ? "#{base}, #{additional_msg}" : base
  end

  # @param inner_or_msg [Exception, String, nil] exception to wrap, a ready-made
  #   message, or nil for StandardError's default message
  # @param additional_msg [String, nil] extra context; only used when an
  #   exception is wrapped
  def initialize(inner_or_msg = nil, additional_msg = nil)
    case inner_or_msg
    when nil, String
      # nil must not reach the wrapping branch: it has neither #message nor
      # #backtrace, so a bare `raise UrlReader::BaseError` would otherwise fail
      # with NoMethodError instead of raising this error.
      super(inner_or_msg)
    else
      super(self.class.msg(inner_or_msg, additional_msg))
      set_backtrace(inner_or_msg.backtrace)
    end
  end
end
@@ -0,0 +1,2 @@
1
# Error type for a response body that could not be converted to UTF-8.
class UrlReader::CannotResolveEncodingError < UrlReader::BaseError; end
@@ -0,0 +1,48 @@
1
require 'cgi'
require 'digest'
require 'uri'

# Small file-backed key/value cache for strings, fronted by an in-memory hash.
#
# The class lives directly under UrlReader so that the constant matches its
# file (lib/url_reader/file_cache.rb) and the bare `FileCache` reference made
# from inside the UrlReader module resolves.
#
# On disk, an entry is stored in the file named after the SHA-256 hex digest of
# the form-encoded key. Each line of that file is
# "<encoded key>\t<escaped value>". Writes append, so the last line for a key
# holds its current value.
class UrlReader::FileCache
  # @param cache_dir_path [String] directory that holds the cache files; it is
  #   not created here
  def initialize(cache_dir_path)
    @cache_dir_path = cache_dir_path
    @cache = {}
  end

  # Returns the cached value for +key+, consulting the file only the first
  # time a key is requested (misses are remembered in memory as well).
  #
  # @param key [String]
  # @return [String, nil] the cached value, or nil when there is none
  def read_entry(key)
    return @cache[key] if @cache.key?(key)

    @cache[key] = read_from_file(key)
  end

  # Stores +value+ under +key+ in memory and appends it to the key's file.
  #
  # @param key [String]
  # @param value [String]
  # @return [true]
  def write_entry(key, value)
    @cache[key] = value
    ekey = encoded_key(key)
    File.open(entry_path(ekey), 'a') { |f| f.puts("#{ekey}\t#{encoded_value(value)}") }
    true
  end

  private

  # Looks +key+ up on disk. An unreadable or malformed file counts as a miss,
  # because the cache is best-effort.
  def read_from_file(key)
    ekey = encoded_key(key)
    path = entry_path(ekey)
    return nil unless File.exist?(path)

    prefix = "#{ekey}\t"
    # Entries are appended, so scan from the end to pick the newest value.
    line = File.readlines(path).reverse_each.find { |l| l.start_with?(prefix) }
    line && decoded_value(line.chomp[prefix.length..-1])
  rescue SystemCallError, IOError, ArgumentError, EncodingError
    nil
  end

  # Path of the file that holds the entry for an already encoded key.
  def entry_path(ekey)
    File.join(@cache_dir_path, Digest::SHA256.hexdigest(ekey))
  end

  # Form-encoding removes tabs and newlines from the key, which keeps the
  # one-entry-per-line, tab-separated format parseable.
  def encoded_key(key)
    URI.encode_www_form_component(key)
  end

  def encoded_value(value)
    CGI.escape(value)
  end

  def decoded_value(value)
    CGI.unescape(value)
  end
end
@@ -0,0 +1,20 @@
1
# Error raised when fetching a URL fails. It wraps the underlying exception
# and classifies it into one of the integer type codes defined below.
class UrlReader::ReadError < UrlReader::BaseError
  # Type codes exposed through #type.
  PageNotFound = 0
  RequestTimeout = 1
  UnidentifiedError = 2

  # @return [Integer] one of PageNotFound, RequestTimeout, UnidentifiedError
  attr_reader :type

  # Accepts the same arguments as BaseError#initialize; the first argument
  # decides the type code.
  def initialize(*args)
    super
    @type = case args.first
            when RestClient::ResourceNotFound then PageNotFound
            when RestClient::RequestTimeout then RequestTimeout
            else UnidentifiedError
            end
  end
end
@@ -0,0 +1,7 @@
1
# Mixin that makes a loosely written URL safe to hand to an HTTP client.
module UrlReader::UrlFixer
  extend ActiveSupport::Concern

  # Percent-encodes the characters of +url+ that are not allowed verbatim:
  # every non-ASCII character, spaces and square brackets. Everything else,
  # including existing percent escapes, is left untouched.
  #
  # @param url [String]
  # @return [String] a copy of +url+ with those characters escaped
  def fixed_url(url)
    # Each matched character is escaped byte by byte ("%XX", upper-case hex),
    # which is what URI.encode produced. URI.encode itself is obsolete and no
    # longer exists in Ruby 3.0+, so it is not used here.
    url.gsub(/[^[:ascii:]]|[ \[\]]/) do |char|
      char.bytes.map { |byte| format('%%%02X', byte) }.join
    end
  end
end
data/lib/url_reader.rb ADDED
@@ -0,0 +1,108 @@
1
+ require_relative 'initialize'
2
+
3
+ require 'kconv'
4
+
5
# Mixin that downloads the body of a URL and returns it as a string.
#
# Responses whose content type is not image/* are transcoded to UTF-8. When
# Rails is loaded and runs in the development environment, bodies are also
# cached in files below Rails.root/tmp/cache/url_reader.
module UrlReader
  extend ActiveSupport::Concern

  include UrlFixer

  # Default for the :request_timeout option.
  REQUEST_TIMEOUT = 10
  # Default for the :request_open_timeout option.
  REQUEST_OPEN_TIMEOUT = 10

  # Reads +url+ and returns its body.
  #
  # In Rails development the file cache is consulted first, unless the
  # environment variable READ_URL_CACHE_NOT_USE is "true"; in that case the
  # cache is not read, but the fetched body is still written to it.
  #
  # @param url [String] address to fetch; it is passed through #fixed_url
  # @param options [Hash] the hash is not modified
  # @option options [Symbol] :method :post issues a POST, anything else a GET
  # @option options [Object] :params payload of a POST request
  # @option options [String] :user_agent sent as the :user_agent request header
  # @option options [Integer] :request_timeout overrides REQUEST_TIMEOUT
  # @option options [Integer] :request_open_timeout overrides REQUEST_OPEN_TIMEOUT
  # @option options [Boolean] :ignore_not_found return nil instead of raising
  #   when the failure is classified as ReadError::PageNotFound
  # @option options [Array] :ignore_read_errors ReadError type codes (Integer)
  #   or names of ReadError constants for which nil is returned instead of raising
  # @return [String, nil] the body, or nil when the failure was ignored
  # @raise [ReadError] when the request fails and the failure is not ignored
  # @raise [CannotResolveEncodingError] when the body cannot be converted to UTF-8
  def read_url(url, options = {})
    return read_url_core(url, options) unless defined?(Rails) && Rails.env.development?
    return read_url_core_with_cache_write(url, options) if ENV['READ_URL_CACHE_NOT_USE'] == 'true'

    ckey = cache_key(url, options)
    cache.read_entry(ckey) || read_url_core_with_cache_write(url, options, ckey)
  end

  private

  # Lazily builds the file cache, creating its directory on first use.
  def cache
    @cache ||= begin
      # Only reached in Rails development, hence the lazy require.
      require 'fileutils'
      dir = File.join(Rails.root, 'tmp/cache/url_reader')
      # FileUtils instead of shelling out to `mkdir -p`: no shell is involved,
      # so paths with spaces or shell metacharacters are handled correctly.
      FileUtils.mkdir_p(dir)
      FileCache.new(dir)
    end
  end

  # Cache key combining the URL with the string form of the options.
  def cache_key(url, options)
    "#{url}?#{options}"
  end

  # Fetches the URL and stores a non-nil body in the cache under +ckey+
  # (computed from +url+ and +options+ when not given).
  def read_url_core_with_cache_write(url, options, ckey = nil)
    ckey ||= cache_key(url, options)
    body = read_url_core(url, options)
    return nil if body.nil?

    cache.write_entry(ckey, body)
    body
  end

  # Performs the HTTP request and post-processes the response body.
  def read_url_core(url, options)
    request = build_request(url, options)
    response =
      begin
        RestClient::Request.execute(request)
      rescue RestClient::ResourceNotFound,
             RestClient::InternalServerError,
             RestClient::RequestTimeout,
             RestClient::ServerBrokeConnection,
             Errno::ECONNREFUSED,
             Errno::ECONNRESET => e
        error = ReadError.new(e, "Read #{request[:url]} failed")
        return nil if ignored_read_error_types(options).include?(error.type)

        raise error
      end
    return nil unless response
    return resolve_encoding(response) if response.headers[:content_type] !~ /^image\//

    response.to_str
  end

  # Assembles the argument hash for RestClient::Request.execute.
  def build_request(url, options)
    headers = {}
    headers[:user_agent] = options[:user_agent] if options[:user_agent]
    request = {
      url: fixed_url(url),
      timeout: options[:request_timeout] || REQUEST_TIMEOUT,
      open_timeout: options[:request_open_timeout] || REQUEST_OPEN_TIMEOUT,
      headers: headers
    }
    if options[:method] == :post
      request.merge(method: :post, payload: options[:params])
    else
      request.merge(method: :get)
    end
  end

  # Type codes of the read errors the caller asked to ignore. The list is
  # built from a copy, so neither +options+ nor the array the caller passed in
  # is changed.
  def ignored_read_error_types(options)
    ignored = Array(options[:ignore_read_errors]).dup
    ignored << 'PageNotFound' if options[:ignore_not_found]
    ignored.map { |x| x.is_a?(Integer) ? x : ReadError.const_get(x) }
  end

  # Converts the response body to UTF-8 from the encoding reported by
  # #response_encoding.
  def resolve_encoding(response)
    response_str = response.to_str
    encoding = response_encoding(response.headers, response_str)
    begin
      response_str.encode(Encoding::UTF_8, encoding)
    rescue Encoding::UndefinedConversionError => e
      # Retry with CP932 / CP51932 when the declared encoding was Shift_JIS /
      # EUC-JP and the first conversion hit an undefined character.
      case encoding
      when Encoding::Shift_JIS then response_str.encode(Encoding::UTF_8, Encoding::CP932)
      when Encoding::EUC_JP then response_str.encode(Encoding::UTF_8, Encoding::CP51932)
      else raise CannotResolveEncodingError, e
      end
    end
  end

  # Picks the encoding of the body: the charset of the Content-Type header
  # first, then the charset found in a <meta> tag of the body, and UTF-8 when
  # neither names an encoding Ruby knows.
  def response_encoding(response_headers, response_str)
    response_str_utf8 = response_str.toutf8
    candidates = [
      response_headers[:content_type].try(:match, /charset=(?<charset>[^;]+)($|;)/),
      response_str_utf8.match(/<meta .*?content="[^"]*?charset=(?<charset>[^;"]+)/),
      response_str_utf8.match(/<meta .*?charset="(?<charset>[^"]+)"/)
    ]
    names = candidates.map { |m| m.try(:[], 'charset') }.compact
    names.map { |name| find_encoding(name) }.compact.first || Encoding::UTF_8
  end

  # Encoding registered under +name+, or nil when Ruby does not know it.
  def find_encoding(name)
    Encoding.find(name)
  rescue ArgumentError
    nil
  end
end
@@ -0,0 +1,6 @@
1
+ require 'bundler/setup'
2
+ Bundler.require
3
+
4
+ require 'webmock/rspec'
5
+
6
+ Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each { |f| require f }
data/spec/test.html ADDED
@@ -0,0 +1 @@
1
+ <html>Test</html>
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
# Minimal host class used to exercise the UrlReader mixin.
class DummUrlReader
  include UrlReader
end

describe UrlReader do
  let(:url) { 'http://www.example.com/test.html' }
  let(:content) { File.read(File.expand_path('test.html', File.dirname(__FILE__))) }
  let(:read) { lambda { DummUrlReader.new.read_url(url) } }

  describe '#read_url' do
    it 'should read url' do
      # Any HTTP method against the URL answers with the fixture's content.
      stub_request(:any, url).to_return(body: content)

      expect(read.call).to eq(content)
    end
  end
end
@@ -0,0 +1,15 @@
1
# Gem specification for url_reader.
Gem::Specification.new do |spec|
  spec.name        = 'url_reader'
  spec.version     = '0.1.0'
  spec.authors     = ['Tetsuri Moriya']
  spec.email       = ['tetsuri.moriya@gmail.com']
  spec.summary     = 'Url reader'
  spec.description = 'Web retrieval module with cache'
  spec.homepage    = 'https://github.com/pandora2000/url_reader'
  spec.license     = 'MIT'

  # Package every file tracked by git.
  spec.files = `git ls-files`.split("\n")

  %w[rspec webmock].each do |gem_name|
    spec.add_development_dependency gem_name, '>= 0'
  end

  spec.add_runtime_dependency 'activesupport', '>= 4'
  spec.add_runtime_dependency 'rest-client', '>= 0'
end
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_reader
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Tetsuri Moriya
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: webmock
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '4'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '4'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rest-client
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: Web retrieval module with cache
70
+ email:
71
+ - tetsuri.moriya@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - ".gitignore"
77
+ - ".ruby-version"
78
+ - Gemfile
79
+ - Gemfile.lock
80
+ - README.md
81
+ - lib/initialize.rb
82
+ - lib/url_reader.rb
83
+ - lib/url_reader/base_error.rb
84
+ - lib/url_reader/cannot_resolve_encoding_error.rb
85
+ - lib/url_reader/file_cache.rb
86
+ - lib/url_reader/read_error.rb
87
+ - lib/url_reader/url_fixer.rb
88
+ - spec/spec_helper.rb
89
+ - spec/test.html
90
+ - spec/url_reader_spec.rb
91
+ - url_reader.gemspec
92
+ homepage: https://github.com/pandora2000/url_reader
93
+ licenses:
94
+ - MIT
95
+ metadata: {}
96
+ post_install_message:
97
+ rdoc_options: []
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: '0'
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 2.2.2
113
+ signing_key:
114
+ specification_version: 4
115
+ summary: Url reader
116
+ test_files: []