web_scraping_helper 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 02ab48dc934b462c435c3cc75f1223f10624ce22
4
+ data.tar.gz: 1a1e020edcecc0ba2af3e2b2a67a7c0cd224e4a1
5
+ SHA512:
6
+ metadata.gz: bb4312d25cddd201cfec98a8d24972d8ea96395d99451d0ca28224db0aee1b2be888fb269f7b3bc5dc014fdfbd005de25f19d1fe4d95b8340a73751ddb33f845
7
+ data.tar.gz: b8895c65fe159fc1c8e0f152807f292e9eaabb53e945c479f5cb74b294ff0d2ac859982f2b0f959900e2f435864916ddb356ecfb7870c0ceee0988684f6e6779
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /vendor
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in web_scraping_helper.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,6 @@
1
+ guard :minitest do
2
+ # with Minitest::Unit
3
+ watch(%r{^test/(.*)\/?(.*)_test\.rb$})
4
+ watch(%r{^lib/(.*/)?([^/]+)\.rb$}) { |m| "test/#{m[1]}#{m[2]}_test.rb" }
5
+ watch(%r{^test/test_helper\.rb$}) { 'test' }
6
+ end
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # WebScrapingHelper
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/web_scraping_helper`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'web_scraping_helper'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install web_scraping_helper
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/momo-lab/web_scraping_helper.
36
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t|
5
+ t.libs << "test"
6
+ t.libs << "lib"
7
+ t.test_files = FileList['test/**/*_test.rb']
8
+ end
9
+
10
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "web_scraping_helper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/web_scraping_helper/version.rb ADDED
@@ -0,0 +1,3 @@
1
+ class WebScrapingHelper
2
+ VERSION = "0.5.1" # gem version; read by web_scraping_helper.gemspec as spec.version
3
+ end
data/lib/web_scraping_helper.rb ADDED
@@ -0,0 +1,146 @@
1
+ #-*- encoding: utf-8 -*-
2
+ require "web_scraping_helper/version"
3
+ require 'rest-client'
4
+ require 'fileutils'
5
+
6
+ class WebScrapingHelper
7
+ DEFAULT_USER_AGENT = 'Mozilla/5.0'
8
+ DEFAULT_WAIT_TIME = 1
9
+ DEFAULT_ENCODING = Encoding::UTF_8
10
+
11
+ def self.reset! # Clears the class-wide cache directory and every registered before/after hook.
12
+ @@global_cache_dir = nil # no shared cache directory until cache_dir= is called
13
+ @@blocks = [] # hook entries are hashes of the form {timing:, proc:}
14
+ end
15
+ reset!
16
+
17
+ def self.cache_dir # Returns the class-wide cache directory (nil when unset).
18
+ @@global_cache_dir
19
+ end
20
+
21
+ def self.cache_dir=(v) # Sets the class-wide cache directory; instances use it when their own cache_dir is unset.
22
+ @@global_cache_dir = v
23
+ end
24
+
25
+ def self.before(&block)
26
+ @@blocks << {timing: :before, proc: block}
27
+ end
28
+
29
+ def self.after(&block)
30
+ @@blocks << {timing: :after, proc: block}
31
+ end
32
+
33
+ def initialize(cookie_filename = nil) # cookie_filename: optional path used to load and persist cookies.
34
+ @jar = HTTP::CookieJar.new
35
+ if cookie_filename
36
+ @jar.load(cookie_filename) if File.exist?(cookie_filename) # a missing file simply leaves the jar empty
37
+ @cookie_filename = cookie_filename # kept so request_http can save the jar after Set-Cookie responses
38
+ end
39
+ end
40
+
41
+ attr_accessor :user_agent, :wait_time, :encoding
42
+ attr_accessor :cache_dir
43
+
44
+ def post(url, opts = {}) # Sends a POST. String keys in opts become headers, opts[:body] the payload, opts[:encoding] overrides the charset.
44
+ request_http(:post, url, opts)
45
+ end
46
+ alias post_http post # support old method
48
+
49
+ def get(url, opts = {}) # Sends a GET, answered from the cache when a cache file for url exists. Takes the same opts as post.
49
+ request_http(:get, url, opts)
50
+ end
51
+ alias get_http get # support old method
53
+
54
+ def exist_cookie?(url)
55
+ cookie = HTTP::Cookie.cookie_value(@jar.cookies(url))
56
+ not cookie.empty?
57
+ end
58
+
59
+ private
60
+
61
+ def request_http(request_method, url, opts)
62
+ @@blocks
63
+ .select{|block| block[:timing] == :before}
64
+ .each{|block| block[:proc].call(url) }
65
+
66
+ if request_method == :get && (res = find_cache(url))
67
+ return res
68
+ end
69
+ wait
70
+
71
+ headers = {}
72
+ opts.each{|k, v| headers[k.downcase] = v if String === k}
73
+ unless headers.key?("user-agent")
74
+ headers["user-agent"] = @user_agent || DEFAULT_USER_AGENT
75
+ end
76
+ unless headers.key?("cookie")
77
+ cookie = HTTP::Cookie.cookie_value(@jar.cookies(url))
78
+ headers["cookie"] = cookie unless cookie.empty?
79
+ end
80
+
81
+ params = {
82
+ method: request_method,
83
+ url: url,
84
+ headers: headers
85
+ }
86
+ params[:payload] = opts[:body] if opts[:body]
87
+ res = RestClient::Request.execute(params)
88
+
89
+ encoding = opts[:encoding] || get_encoding(res)
90
+ if encoding
91
+ res.force_encoding(encoding)
92
+ res.encode!(@encoding || DEFAULT_ENCODING)
93
+ end
94
+
95
+ cookies = res.headers[:set_cookie]
96
+ if cookies
97
+ cookies.each{|cookie| @jar.parse(cookie, url)}
98
+ @jar.save(@cookie_filename) if @cookie_filename
99
+ end
100
+
101
+ set_wait_base_time
102
+ save_cache(url, res)
103
+
104
+ @@blocks
105
+ .select{|block| block[:timing] == :after}
106
+ .each{|block| block[:proc].call(url, res) }
107
+
108
+ res
109
+ end
110
+
111
+ def get_encoding(res)
112
+ content_type = res.headers[:content_type]
113
+ return content_type[/;\s*charset=([^;]+)/, 1] if content_type
114
+ end
115
+
116
+ def find_cache(url)
117
+ cache_file = url_to_cache_path(url)
118
+ return nil if cache_file.nil? or not File.exist?(cache_file)
119
+ File.open(cache_file, "r:utf-8"){|f| f.read}
120
+ end
121
+
122
+ def save_cache(url, html)
123
+ cache_file = url_to_cache_path(url)
124
+ return if cache_file.nil?
125
+ FileUtils.mkdir_p(File.dirname(cache_file))
126
+ File.open(cache_file, "w+:utf-8"){|f| f.print html}
127
+ end
128
+
129
+ def url_to_cache_path(url)
130
+ cache_dir = @cache_dir || @@global_cache_dir
131
+ return unless cache_dir
132
+ File.expand_path(url.gsub(%r{[\\/:\?"<>\|]}, "_"), cache_dir)
133
+ end
134
+
135
+ def wait
136
+ if not @prev_time.nil?
137
+ wait_time = @wait_time || DEFAULT_WAIT_TIME
138
+ wait_time -= (Time.now - @prev_time)
139
+ sleep wait_time if wait_time > 0
140
+ end
141
+ end
142
+
143
+ def set_wait_base_time
144
+ @prev_time = Time.now
145
+ end
146
+ end
data/web_scraping_helper.gemspec ADDED
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'web_scraping_helper/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "web_scraping_helper"
8
+ spec.version = WebScrapingHelper::VERSION
9
+ spec.authors = ["momo-lab"]
10
+ spec.email = ["momotaro.n@gmail.com"]
11
+
12
+ spec.summary = "WebScrapingHelper"
13
+ spec.description = "WebScrapingHelper"
14
+ spec.homepage = "https://github.com/momo-lab/web_scraping_helper"
15
+
16
+ # # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
17
+ # # delete this section to allow pushing this gem to any host.
18
+ # if spec.respond_to?(:metadata)
19
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
20
+ # else
21
+ # raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
22
+ # end
23
+
24
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
+ spec.bindir = "exe"
26
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ["lib"]
28
+
29
+ spec.add_dependency "rest-client"
30
+
31
+ spec.add_development_dependency "bundler", "~> 1.11"
32
+ spec.add_development_dependency "rake", "~> 10.0"
33
+ spec.add_development_dependency "minitest", "~> 5.0"
34
+ spec.add_development_dependency "minitest-reporters"
35
+ spec.add_development_dependency "minitest-power_assert"
36
+ spec.add_development_dependency "webmock", "~> 2.0"
37
+ spec.add_development_dependency "guard"
38
+ spec.add_development_dependency "guard-minitest"
39
+ end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web_scraping_helper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.1
5
+ platform: ruby
6
+ authors:
7
+ - momo-lab
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-06-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rest-client
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.11'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.11'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest-reporters
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: minitest-power_assert
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: webmock
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '2.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '2.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: guard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: guard-minitest
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: WebScrapingHelper
140
+ email:
141
+ - momotaro.n@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files: []
145
+ files:
146
+ - ".gitignore"
147
+ - Gemfile
148
+ - Guardfile
149
+ - README.md
150
+ - Rakefile
151
+ - bin/console
152
+ - bin/setup
153
+ - lib/web_scraping_helper.rb
154
+ - lib/web_scraping_helper/version.rb
155
+ - web_scraping_helper.gemspec
156
+ homepage: https://github.com/momo-lab/web_scraping_helper
157
+ licenses: []
158
+ metadata: {}
159
+ post_install_message:
160
+ rdoc_options: []
161
+ require_paths:
162
+ - lib
163
+ required_ruby_version: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - ">="
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ requirements: []
174
+ rubyforge_project:
175
+ rubygems_version: 2.5.1
176
+ signing_key:
177
+ specification_version: 4
178
+ summary: WebScrapingHelper
179
+ test_files: []