web_scraping_helper 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 02ab48dc934b462c435c3cc75f1223f10624ce22
4
+ data.tar.gz: 1a1e020edcecc0ba2af3e2b2a67a7c0cd224e4a1
5
+ SHA512:
6
+ metadata.gz: bb4312d25cddd201cfec98a8d24972d8ea96395d99451d0ca28224db0aee1b2be888fb269f7b3bc5dc014fdfbd005de25f19d1fe4d95b8340a73751ddb33f845
7
+ data.tar.gz: b8895c65fe159fc1c8e0f152807f292e9eaabb53e945c479f5cb74b294ff0d2ac859982f2b0f959900e2f435864916ddb356ecfb7870c0ceee0988684f6e6779
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /vendor
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in web_scraping_helper.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,6 @@
1
# Guard configuration: re-run Minitest tests when watched files change.
+ guard :minitest do
2
+ # with Minitest::Unit
3
# Watch every *_test.rb file under test/ (no block is given for this pattern).
+ watch(%r{^test/(.*)\/?(.*)_test\.rb$})
4
# A change to lib/<dir/><name>.rb maps to test/<dir/><name>_test.rb.
+ watch(%r{^lib/(.*/)?([^/]+)\.rb$}) { |m| "test/#{m[1]}#{m[2]}_test.rb" }
5
# A change to the shared test helper maps to the whole 'test' directory.
+ watch(%r{^test/test_helper\.rb$}) { 'test' }
6
+ end
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # WebScrapingHelper
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/web_scraping_helper.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'web_scraping_helper'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install web_scraping_helper
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/momo-lab/web_scraping_helper.
36
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Defines the :test Rake task.
+ Rake::TestTask.new(:test) do |t|
5
# Put both test/ and lib/ on the load path used by the test task.
+ t.libs << "test"
6
+ t.libs << "lib"
7
# Run every file under test/ (at any depth) whose name ends in _test.rb.
+ t.test_files = FileList['test/**/*_test.rb']
8
+ end
9
+
10
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "web_scraping_helper"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
class WebScrapingHelper
  # Version string of the gem. Frozen so the constant cannot be mutated
  # in place by code that reads it.
  VERSION = "0.5.1".freeze
end
@@ -0,0 +1,146 @@
1
+ #-*- encoding: utf-8 -*-
2
+ require "web_scraping_helper/version"
3
+ require 'rest-client'
4
+ require 'fileutils'
5
+
6
# Small HTTP client for web scraping, built on RestClient::Request.execute.
#
# On top of the plain request it provides:
# * a cookie jar (HTTP::CookieJar), optionally loaded from / saved to a file;
# * a minimum interval between consecutive requests made by one instance;
# * transcoding of the response body to #encoding (UTF-8 by default);
# * an optional on-disk cache of GET responses;
# * class-wide before/after request callbacks.
#
# NOTE(review): HTTP::CookieJar / HTTP::Cookie are not required explicitly by
# this file; presumably they are loaded through rest-client -- confirm.
class WebScrapingHelper
  DEFAULT_USER_AGENT = 'Mozilla/5.0'
  DEFAULT_WAIT_TIME = 1
  DEFAULT_ENCODING = Encoding::UTF_8

  # Characters replaced by "_" when a URL is turned into a cache file name.
  UNSAFE_FILENAME_CHARS = %r{[\\/:*?"<>|]}
  private_constant :UNSAFE_FILENAME_CHARS

  # Clears the class-wide cache directory and all registered callbacks.
  #
  # Class variables are kept on purpose (instead of class-level instance
  # variables) so that subclasses keep sharing this state.
  def self.reset!
    @@global_cache_dir = nil
    @@blocks = []
  end
  reset!

  # @return [String, nil] cache directory used by instances that have no
  #   cache directory of their own
  def self.cache_dir
    @@global_cache_dir
  end

  # @param v [String, nil] class-wide cache directory; nil disables it
  def self.cache_dir=(v)
    @@global_cache_dir = v
  end

  # Registers a callback invoked with (url) at the start of every request,
  # including requests that end up being answered from the cache.
  def self.before(&block)
    @@blocks << {timing: :before, proc: block}
  end

  # Registers a callback invoked with (url, response) after every request
  # that actually went over the network.
  def self.after(&block)
    @@blocks << {timing: :after, proc: block}
  end

  # @param cookie_filename [String, nil] file the cookie jar is saved to
  #   whenever a response sets cookies; it is loaded first if it exists
  def initialize(cookie_filename = nil)
    @jar = HTTP::CookieJar.new
    return unless cookie_filename

    @jar.load(cookie_filename) if File.exist?(cookie_filename)
    @cookie_filename = cookie_filename
  end

  attr_accessor :user_agent, :wait_time, :encoding
  attr_accessor :cache_dir

  # Sends a POST request.
  #
  # @param url [String]
  # @param opts [Hash] String keys are sent as request headers (names are
  #   downcased); :body is the payload; :encoding overrides the charset
  #   taken from the response's Content-Type
  # @return the RestClient response
  def post(url, opts = {})
    request_http(:post, url, opts)
  end
  alias post_http post # support old method

  # Sends a GET request, or returns the cached body when one exists.
  #
  # @param url [String]
  # @param opts [Hash] same options as #post
  # @return the RestClient response, or a String read from the cache
  def get(url, opts = {})
    request_http(:get, url, opts)
  end
  alias get_http get # support old method

  # @param url [String]
  # @return [Boolean] true when the jar holds at least one cookie for url
  def exist_cookie?(url)
    !HTTP::Cookie.cookie_value(@jar.cookies(url)).empty?
  end

  private

  # Runs one request: before-callbacks, cache lookup (GET only), throttling,
  # the request itself, transcoding, cookie storage, cache write (GET only)
  # and after-callbacks, in that order.
  def request_http(request_method, url, opts)
    run_callbacks(:before, url)

    if request_method == :get
      cached = find_cache(url)
      return cached if cached
    end
    wait

    params = {
      method: request_method,
      url: url,
      headers: build_headers(url, opts)
    }
    params[:payload] = opts[:body] if opts[:body]
    res = RestClient::Request.execute(params)

    encoding = opts[:encoding] || get_encoding(res)
    if encoding
      res.force_encoding(encoding)
      res.encode!(@encoding || DEFAULT_ENCODING)
    end

    store_cookies(url, res)
    set_wait_base_time
    # Only GET responses are cached: the cache is only ever read for GET, so
    # storing a POST response would later be served as if it were the GET.
    save_cache(url, res) if request_method == :get
    run_callbacks(:after, url, res)

    res
  end

  # Invokes every registered callback of the given timing with args.
  def run_callbacks(timing, *args)
    @@blocks.each do |block|
      block[:proc].call(*args) if block[:timing] == timing
    end
  end

  # Builds the request headers from the String keys of opts, adding a default
  # User-Agent and the jar's cookies unless the caller supplied those headers.
  def build_headers(url, opts)
    headers = {}
    opts.each { |key, value| headers[key.downcase] = value if key.is_a?(String) }
    unless headers.key?("user-agent")
      headers["user-agent"] = @user_agent || DEFAULT_USER_AGENT
    end
    unless headers.key?("cookie")
      cookie = HTTP::Cookie.cookie_value(@jar.cookies(url))
      headers["cookie"] = cookie unless cookie.empty?
    end
    headers
  end

  # Feeds the response's Set-Cookie headers into the jar and persists the jar
  # when a cookie file was given to #initialize.
  def store_cookies(url, res)
    cookies = res.headers[:set_cookie]
    return unless cookies

    Array(cookies).each { |cookie| @jar.parse(cookie, url) }
    @jar.save(@cookie_filename) if @cookie_filename
  end

  # Extracts the charset from the response's Content-Type header.
  #
  # Surrounding whitespace and quotes are removed, and the name is checked
  # against Ruby's known encodings, so the result is always safe to pass to
  # String#force_encoding.
  #
  # @return [String, nil] canonical encoding name, or nil when the header has
  #   no charset or names an encoding Ruby does not know
  def get_encoding(res)
    content_type = res.headers[:content_type]
    return unless content_type

    charset = content_type[/;\s*charset=([^;]+)/i, 1]
    return unless charset

    Encoding.find(charset.strip.delete(%q("'))).name
  rescue ArgumentError
    # Unknown (or empty) charset name: leave the body untouched.
    nil
  end

  # @return [String, nil] cached body for url, or nil when caching is
  #   disabled or nothing is cached for url
  def find_cache(url)
    cache_file = url_to_cache_path(url)
    return nil if cache_file.nil? || !File.exist?(cache_file)

    File.open(cache_file, "r:utf-8") { |f| f.read }
  end

  # Writes html to the cache file for url; does nothing when caching is
  # disabled. The cache directory is created on demand.
  def save_cache(url, html)
    cache_file = url_to_cache_path(url)
    return if cache_file.nil?

    FileUtils.mkdir_p(File.dirname(cache_file))
    File.open(cache_file, "w:utf-8") { |f| f.write(html) }
  end

  # Maps url to a file inside the instance's cache directory (falling back to
  # the class-wide one), replacing characters that are not allowed in file
  # names with "_".
  #
  # @return [String, nil] absolute path, or nil when no cache directory is set
  def url_to_cache_path(url)
    cache_dir = @cache_dir || @@global_cache_dir
    return unless cache_dir

    File.expand_path(url.gsub(UNSAFE_FILENAME_CHARS, "_"), cache_dir)
  end

  # Sleeps until at least wait_time seconds have passed since the previous
  # request of this instance; the first request is never delayed.
  def wait
    return if @prev_time.nil?

    remaining = (@wait_time || DEFAULT_WAIT_TIME) - (monotonic_now - @prev_time)
    sleep remaining if remaining > 0
  end

  # Records the moment the latest request finished, as the base for #wait.
  def set_wait_base_time
    @prev_time = monotonic_now
  end

  # Monotonic clock reading in seconds; unlike Time.now it is unaffected by
  # wall-clock adjustments, so the computed interval cannot go wrong.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'web_scraping_helper/version'
5
+
6
# Gem specification for web_scraping_helper.
+ Gem::Specification.new do |spec|
7
+ spec.name = "web_scraping_helper"
8
# The version comes from the VERSION constant loaded by the require above this block.
+ spec.version = WebScrapingHelper::VERSION
9
+ spec.authors = ["momo-lab"]
10
+ spec.email = ["momotaro.n@gmail.com"]
11
+
12
+ spec.summary = "WebScrapingHelper"
13
+ spec.description = "WebScrapingHelper"
14
+ spec.homepage = "https://github.com/momo-lab/web_scraping_helper"
15
+
16
+ # # Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
17
+ # # delete this section to allow pushing this gem to any host.
18
+ # if spec.respond_to?(:metadata)
19
+ # spec.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"
20
+ # else
21
+ # raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
22
+ # end
23
+
24
# Package every git-tracked file except those under test/, spec/ or features/.
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
25
+ spec.bindir = "exe"
26
# Executables are the basenames of packaged files under exe/.
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
27
+ spec.require_paths = ["lib"]
28
+
29
# The only runtime dependency; no version constraint is given.
+ spec.add_dependency "rest-client"
30
+
31
# Development-only dependencies.
+ spec.add_development_dependency "bundler", "~> 1.11"
32
+ spec.add_development_dependency "rake", "~> 10.0"
33
+ spec.add_development_dependency "minitest", "~> 5.0"
34
+ spec.add_development_dependency "minitest-reporters"
35
+ spec.add_development_dependency "minitest-power_assert"
36
+ spec.add_development_dependency "webmock", "~> 2.0"
37
+ spec.add_development_dependency "guard"
38
+ spec.add_development_dependency "guard-minitest"
39
+ end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web_scraping_helper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.5.1
5
+ platform: ruby
6
+ authors:
7
+ - momo-lab
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-06-05 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rest-client
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.11'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.11'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest-reporters
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: minitest-power_assert
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: webmock
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '2.0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '2.0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: guard
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: guard-minitest
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ description: WebScrapingHelper
140
+ email:
141
+ - momotaro.n@gmail.com
142
+ executables: []
143
+ extensions: []
144
+ extra_rdoc_files: []
145
+ files:
146
+ - ".gitignore"
147
+ - Gemfile
148
+ - Guardfile
149
+ - README.md
150
+ - Rakefile
151
+ - bin/console
152
+ - bin/setup
153
+ - lib/web_scraping_helper.rb
154
+ - lib/web_scraping_helper/version.rb
155
+ - web_scraping_helper.gemspec
156
+ homepage: https://github.com/momo-lab/web_scraping_helper
157
+ licenses: []
158
+ metadata: {}
159
+ post_install_message:
160
+ rdoc_options: []
161
+ require_paths:
162
+ - lib
163
+ required_ruby_version: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - ">="
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ requirements: []
174
+ rubyforge_project:
175
+ rubygems_version: 2.5.1
176
+ signing_key:
177
+ specification_version: 4
178
+ summary: WebScrapingHelper
179
+ test_files: []