url_reader 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +55 -0
- data/README.md +0 -0
- data/lib/initialize.rb +7 -0
- data/lib/url_reader/base_error.rb +18 -0
- data/lib/url_reader/cannot_resolve_encoding_error.rb +2 -0
- data/lib/url_reader/file_cache.rb +48 -0
- data/lib/url_reader/read_error.rb +20 -0
- data/lib/url_reader/url_fixer.rb +7 -0
- data/lib/url_reader.rb +108 -0
- data/spec/spec_helper.rb +6 -0
- data/spec/test.html +1 -0
- data/spec/url_reader_spec.rb +18 -0
- data/url_reader.gemspec +15 -0
- metadata +116 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 90d8ea16e1c937474887f35170886a5e3e283c5e
|
4
|
+
data.tar.gz: eec8a64973d2c4ecba39f266cf6a168a6e21145b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 97e812e7b7fa2ef8f004b86149ef9e28de861daa8190063de2ef0e2aec6e60b55a4fbcd9bd2688ae7fb25c62fe2b69037e7ac521c2042205180d773ce2a1be50
|
7
|
+
data.tar.gz: b7b3e385fdb44e1b58cbf71348556accc336d5ca3e0eb2ea1ef254b1d85cde697e2b1e31431ad3c47588334b6a846c6dfb779f032551892b8f2899117c11417a
|
data/.gitignore
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1.2
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
url_reader (0.1.0)
|
5
|
+
activesupport (>= 4)
|
6
|
+
rest-client
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
activesupport (4.1.8)
|
12
|
+
i18n (~> 0.6, >= 0.6.9)
|
13
|
+
json (~> 1.7, >= 1.7.7)
|
14
|
+
minitest (~> 5.1)
|
15
|
+
thread_safe (~> 0.1)
|
16
|
+
tzinfo (~> 1.1)
|
17
|
+
addressable (2.3.6)
|
18
|
+
crack (0.4.2)
|
19
|
+
safe_yaml (~> 1.0.0)
|
20
|
+
diff-lcs (1.2.5)
|
21
|
+
i18n (0.6.11)
|
22
|
+
json (1.8.1)
|
23
|
+
mime-types (2.4.3)
|
24
|
+
minitest (5.4.3)
|
25
|
+
netrc (0.8.0)
|
26
|
+
rest-client (1.7.2)
|
27
|
+
mime-types (>= 1.16, < 3.0)
|
28
|
+
netrc (~> 0.7)
|
29
|
+
rspec (3.1.0)
|
30
|
+
rspec-core (~> 3.1.0)
|
31
|
+
rspec-expectations (~> 3.1.0)
|
32
|
+
rspec-mocks (~> 3.1.0)
|
33
|
+
rspec-core (3.1.7)
|
34
|
+
rspec-support (~> 3.1.0)
|
35
|
+
rspec-expectations (3.1.2)
|
36
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
+
rspec-support (~> 3.1.0)
|
38
|
+
rspec-mocks (3.1.3)
|
39
|
+
rspec-support (~> 3.1.0)
|
40
|
+
rspec-support (3.1.2)
|
41
|
+
safe_yaml (1.0.4)
|
42
|
+
thread_safe (0.3.4)
|
43
|
+
tzinfo (1.2.2)
|
44
|
+
thread_safe (~> 0.1)
|
45
|
+
webmock (1.20.3)
|
46
|
+
addressable (>= 2.3.6)
|
47
|
+
crack (>= 0.3.2)
|
48
|
+
|
49
|
+
PLATFORMS
|
50
|
+
ruby
|
51
|
+
|
52
|
+
DEPENDENCIES
|
53
|
+
rspec
|
54
|
+
url_reader!
|
55
|
+
webmock
|
data/README.md
ADDED
File without changes
|
data/lib/initialize.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Base class for the errors raised by UrlReader.
#
# An instance can be built in two ways:
# * from a plain message (or nothing at all), like any StandardError;
# * from another exception, in which case the message becomes
#   "<WrappedClass>: <wrapped message>[, <additional_msg>]" and the wrapped
#   exception's backtrace is reused.
class UrlReader::BaseError < StandardError
  # Builds the message used when wrapping another exception.
  #
  # @param error [Exception] the exception being wrapped
  # @param additional_msg [String, nil] optional context appended after a comma
  # @return [String]
  def self.msg(error, additional_msg = nil)
    text = "#{error.class.name}: #{error.message}"
    text += ", #{additional_msg}" if additional_msg
    text
  end

  # @param inner_or_msg [Exception, String, nil] exception to wrap, or a message
  # @param additional_msg [String, nil] extra context; only used when wrapping
  #   an exception (it is ignored for plain messages, as before)
  def initialize(inner_or_msg = nil, additional_msg = nil)
    if inner_or_msg.is_a?(Exception)
      super(self.class.msg(inner_or_msg, additional_msg))
      # An exception that was never raised has no backtrace to copy.
      set_backtrace(inner_or_msg.backtrace) if inner_or_msg.backtrace
    else
      # Covers String messages and the nil default, so that `raise BaseError`
      # without arguments works instead of failing on nil.message.
      super(inner_or_msg)
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# Append-only, file-backed key/value cache with an in-memory layer in front.
#
# Every key is form-encoded; the SHA-256 hex digest of the encoded key names
# the file (inside cache_dir_path) that stores it. A file holds one
# "<encoded key>\t<escaped value>" line per write.
#
# NOTE(review): the class is declared under Util::UrlReader, while the visible
# caller refers to FileCache from inside `module UrlReader` — confirm which
# namespace is intended.
class Util::UrlReader::FileCache
  # @param cache_dir_path [String] directory holding the cache files; it is
  #   not created here, so it must already exist before the first write
  def initialize(cache_dir_path)
    @cache_dir_path = cache_dir_path
    @cache = {}
  end

  # Returns the cached value for key, or nil when nothing is stored.
  # The outcome (including a miss) is memoized in memory.
  #
  # @param key [String]
  # @return [String, nil]
  def read_entry(key)
    return @cache[key] if @cache.key?(key)
    @cache[key] = read_from_file(key)
  end

  # Stores value under key, both in memory and appended to the cache file.
  #
  # @param key [String]
  # @param value [String]
  # @return [true]
  def write_entry(key, value)
    @cache[key] = value
    ekey = encoded_key(key)
    File.open(entry_path(ekey), 'a') { |f| f.puts("#{ekey}\t#{encoded_value(value)}") }
    true
  end

  private

  # Looks the key up on disk. Files are append-only, so the LAST matching
  # line is the most recent write and is the one returned.
  # An unreadable or corrupt cache file is treated as a cache miss.
  def read_from_file(key)
    ekey = encoded_key(key)
    path = entry_path(ekey)
    return nil unless File.exist?(path)

    prefix = "#{ekey}\t"
    # File.readlines closes the file itself (no leaked handle).
    line = File.readlines(path).reverse_each.find { |l| l.start_with?(prefix) }
    # chomp only the line terminator so an empty stored value survives.
    line && decoded_value(line.chomp.split("\t", 2)[1])
  rescue SystemCallError, IOError, ArgumentError, EncodingError
    nil
  end

  # Path of the file that stores the given encoded key.
  def entry_path(ekey)
    File.join(@cache_dir_path, Digest::SHA256.hexdigest(ekey))
  end

  def encoded_key(key)
    URI.encode_www_form_component(key)
  end

  def encoded_value(value)
    CGI.escape(value)
  end

  def decoded_value(value)
    CGI.unescape(value)
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Error raised when fetching a URL fails.
#
# #type classifies the first constructor argument: PageNotFound for
# RestClient::ResourceNotFound, RequestTimeout for RestClient::RequestTimeout,
# and UnidentifiedError for anything else.
class UrlReader::ReadError < UrlReader::BaseError
  PageNotFound = 0
  RequestTimeout = 1
  UnidentifiedError = 2

  # @return [Integer] one of the type constants above
  attr_reader :type

  # Accepts the same arguments as UrlReader::BaseError#initialize.
  def initialize(*args)
    super(*args)
    @type =
      case args.first
      when RestClient::ResourceNotFound then PageNotFound
      when RestClient::RequestTimeout then RequestTimeout
      else UnidentifiedError
      end
  end
end
|
data/lib/url_reader.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require_relative 'initialize'
|
2
|
+
|
3
|
+
require 'kconv'
|
4
|
+
|
5
|
+
# Mixin that fetches the body of a URL through RestClient.
#
# Non-image responses are converted to UTF-8; responses whose content type
# starts with "image/" are returned as received. When Rails is loaded and
# running in the development environment, responses are also kept in a file
# cache under Rails.root/tmp/cache/url_reader.
module UrlReader
  extend ActiveSupport::Concern

  include UrlFixer

  REQUEST_TIMEOUT = 10
  REQUEST_OPEN_TIMEOUT = 10

  # Reads the given URL.
  #
  # @param url [String]
  # @param options [Hash] recognised keys:
  #   :method (:post for POST, anything else is GET), :params (POST payload),
  #   :user_agent, :request_timeout, :request_open_timeout,
  #   :ignore_not_found (truthy: return nil on a not-found response),
  #   :ignore_read_errors (ReadError type constants, as Integers or names,
  #   for which nil is returned instead of raising)
  # @return [String, nil] the response body, or nil for an ignored error
  # @raise [ReadError] when the request fails with a non-ignored error
  # @raise [CannotResolveEncodingError] when the body cannot be converted to UTF-8
  def read_url(url, options = {})
    rails_development = defined?(Rails) && Rails.env.development?
    return read_url_core(url, options) unless rails_development

    # READ_URL_CACHE_NOT_USE=true skips the cache lookup, but the fresh
    # response is still written to the cache.
    return read_url_core_with_cache_write(url, options) if ENV['READ_URL_CACHE_NOT_USE'] == 'true'

    ckey = cache_key(url, options)
    cache.read_entry(ckey) || read_url_core_with_cache_write(url, options, ckey)
  end

  private

  # Lazily creates the cache directory and the FileCache on top of it.
  def cache
    @cache ||= begin
      # Loaded here because the cache is only reached in Rails development.
      require 'fileutils'
      dir = File.join(Rails.root, 'tmp/cache/url_reader')
      # No shell involved, so unusual characters in the path are harmless.
      FileUtils.mkdir_p(dir)
      FileCache.new(dir)
    end
  end

  def cache_key(url, options)
    "#{url}?#{options.to_s}"
  end

  # Fetches the URL and, unless the result is nil, stores it in the cache.
  def read_url_core_with_cache_write(url, options, ckey = nil)
    ckey ||= cache_key(url, options)
    res = read_url_core(url, options)
    return nil if res.nil?
    cache.write_entry(ckey, res)
    res
  end

  # Performs the HTTP request and post-processes the response body.
  def read_url_core(url, options)
    request = request_arguments(url, options)
    response =
      begin
        RestClient::Request.execute(request)
      rescue RestClient::ResourceNotFound,
             RestClient::InternalServerError,
             RestClient::RequestTimeout,
             RestClient::ServerBrokeConnection,
             Errno::ECONNREFUSED,
             Errno::ECONNRESET => e
        error = ReadError.new(e, "Read #{request[:url]} failed")
        return nil if read_error_ignored?(error, options)
        raise error
      end
    return nil unless response
    return resolve_encoding(response) if response.headers[:content_type] !~ %r{\Aimage/}
    response.to_str
  end

  # Builds the argument hash for RestClient::Request.execute.
  # fixed_url is presumably provided by UrlFixer — TODO confirm.
  def request_arguments(url, options)
    headers = {}
    headers[:user_agent] = options[:user_agent] if options[:user_agent]
    arguments = {
      url: fixed_url(url),
      timeout: options[:request_timeout] || REQUEST_TIMEOUT,
      open_timeout: options[:request_open_timeout] || REQUEST_OPEN_TIMEOUT,
      headers: headers
    }
    if options[:method] == :post
      arguments.merge(method: :post, payload: options[:params])
    else
      arguments.merge(method: :get)
    end
  end

  # True when the caller asked for this kind of read error to be ignored.
  # Works on a private list so the caller's options hash (and any array it
  # passed as :ignore_read_errors) is never modified.
  def read_error_ignored?(error, options)
    ignored = Array(options[:ignore_read_errors])
    ignored += ['PageNotFound'] if options[:ignore_not_found]
    ignored.map { |x| x.is_a?(Integer) ? x : ReadError.const_get(x) }.include?(error.type)
  end

  # Converts the response body to UTF-8 from the detected encoding.
  # Shift_JIS and EUC-JP bodies are retried as CP932 / CP51932 when the first
  # conversion hits an undefined character.
  def resolve_encoding(response)
    response_str = response.to_str
    encoding = response_encoding(response.headers, response_str)
    begin
      response_str.encode(Encoding::UTF_8, encoding)
    rescue Encoding::UndefinedConversionError => e
      return response_str.encode(Encoding::UTF_8, Encoding::CP932) if encoding == Encoding::Shift_JIS
      return response_str.encode(Encoding::UTF_8, Encoding::CP51932) if encoding == Encoding::EUC_JP
      raise CannotResolveEncodingError, e
    end
  end

  # Picks the body encoding from, in order of preference: the Content-Type
  # header charset, a <meta ... content="...charset=..."> tag, and a
  # <meta ... charset="..."> tag. Falls back to UTF-8 when none of them names
  # an encoding Ruby knows.
  def response_encoding(response_headers, response_str)
    response_str_utf8 = response_str.toutf8
    matches = [
      response_headers[:content_type].try(:match, /charset=(?<charset>[^;]+)($|;)/),
      response_str_utf8.match(/<meta .*?content="[^"]*?charset=(?<charset>[^;"]+)/),
      response_str_utf8.match(/<meta .*?charset="(?<charset>[^"]+)"/)
    ]
    matches
      .map { |m| find_encoding(m && m['charset']) }
      .compact
      .push(Encoding::UTF_8)
      .first
  end

  # Resolves a charset name to an Encoding, or nil when it is missing/unknown.
  # Surrounding whitespace and quotes (e.g. charset="utf-8") are removed first.
  def find_encoding(name)
    return nil unless name
    Encoding.find(name.strip.delete("\"'"))
  rescue ArgumentError
    nil
  end
end
|
data/spec/spec_helper.rb
ADDED
data/spec/test.html
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
<html>Test</html>
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Minimal host class: mixes in UrlReader so the spec can call #read_url.
class DummUrlReader
  include UrlReader
end

describe UrlReader do
  # Stubbed address and the fixture body served for it.
  let(:url) { 'http://www.example.com/test.html' }
  let(:content) { File.read(File.join(File.dirname(__FILE__), 'test.html')) }
  # Deferred call so each example decides when the request is made.
  let(:read) do
    lambda { DummUrlReader.new.read_url(url) }
  end

  describe '#read_url' do
    it 'should read url' do
      stub_request(:any, url).to_return(body: content)
      expect(read.call).to eq(content)
    end
  end
end
|
data/url_reader.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem specification for url_reader.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'url_reader'
  spec.version     = '0.1.0'
  spec.authors     = ['Tetsuri Moriya']
  spec.email       = ['tetsuri.moriya@gmail.com']
  spec.summary     = 'Url reader'
  spec.description = 'Web retrieval module with cache'
  spec.homepage    = 'https://github.com/pandora2000/url_reader'
  spec.license     = 'MIT'

  # Packaged files: everything tracked by git.
  spec.files = `git ls-files`.split("\n")

  # Dependencies (declaration order kept as-is).
  spec.add_development_dependency('rspec', '>= 0')
  spec.add_development_dependency('webmock', '>= 0')
  spec.add_runtime_dependency('activesupport', '>= 4')
  spec.add_runtime_dependency('rest-client', '>= 0')
end
|
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_reader
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Tetsuri Moriya
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-23 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: webmock
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: activesupport
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '4'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '4'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rest-client
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: Web retrieval module with cache
|
70
|
+
email:
|
71
|
+
- tetsuri.moriya@gmail.com
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- ".gitignore"
|
77
|
+
- ".ruby-version"
|
78
|
+
- Gemfile
|
79
|
+
- Gemfile.lock
|
80
|
+
- README.md
|
81
|
+
- lib/initialize.rb
|
82
|
+
- lib/url_reader.rb
|
83
|
+
- lib/url_reader/base_error.rb
|
84
|
+
- lib/url_reader/cannot_resolve_encoding_error.rb
|
85
|
+
- lib/url_reader/file_cache.rb
|
86
|
+
- lib/url_reader/read_error.rb
|
87
|
+
- lib/url_reader/url_fixer.rb
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
- spec/test.html
|
90
|
+
- spec/url_reader_spec.rb
|
91
|
+
- url_reader.gemspec
|
92
|
+
homepage: https://github.com/pandora2000/url_reader
|
93
|
+
licenses:
|
94
|
+
- MIT
|
95
|
+
metadata: {}
|
96
|
+
post_install_message:
|
97
|
+
rdoc_options: []
|
98
|
+
require_paths:
|
99
|
+
- lib
|
100
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
requirements: []
|
111
|
+
rubyforge_project:
|
112
|
+
rubygems_version: 2.2.2
|
113
|
+
signing_key:
|
114
|
+
specification_version: 4
|
115
|
+
summary: Url reader
|
116
|
+
test_files: []
|