crawler_sample 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZTEyODc4NDZlMmE3MWYyYzM5YzUyYmQ5ZjlkMmEwZTAxNGQ3NWMwMQ==
5
+ data.tar.gz: !binary |-
6
+ ZGE0ZTA0NzdhNTkwNGM5OWY4ZmE5NDU5ZWY5MjJmZjQzNWFjNjBjNw==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MGZmYmZlZTRkYmUxYWU5NjNlMjcyMGNmNTZhOWI4YjcxZmVkNzBlOThlNTFm
10
+ OTU1ZWUyYzY3MWE5MGU1ZmQyYTA3YWNlMTAyMTc0ODIzZTJjZTZmNjNlNWNj
11
+ MDJjNDY5ODFjZGFhN2NiZjc5YzQxOTNkMmU4OGY4NGUxNGMzYzk=
12
+ data.tar.gz: !binary |-
13
+ NTI3ZWMxMTJkOWU1MzNhNDUzYTA2NzFmOWFlNTNlY2JjNjBkMjA0OWY5YjM1
14
+ NzMwMDE4YTI4MjVhOTgyN2ZkYTdjZjQ5YTg2Y2ZmNDAzMWM5ZTBlM2YzNzY3
15
+ YWVkMDc5YWJlZDRmMTc2MjVhZmFhN2ExYjViNTRjZmQ1MDJmN2I=
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in crawler_sample.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Shigeki Doumae
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,30 @@
1
+ # CrawlerSample
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'crawler_sample'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install crawler_sample
18
+
19
+ ## Usage
20
+
21
+ This gem is a web crawler sample code.
22
+ So I do not recommend that you use it.
23
+
24
+ ## Contributing
25
+
26
+ 1. Fork it
27
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
28
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
29
+ 4. Push to the branch (`git push origin my-new-feature`)
30
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'crawler_sample/version'

Gem::Specification.new do |gem|
  gem.name          = "crawler_sample"
  gem.version       = CrawlerSample::VERSION
  gem.authors       = ["Shigeki Doumae"]
  gem.email         = ["shigeki.doumae@gmail.com"]
  # Typo fixes in user-facing strings: "reccmmend" -> "recommend",
  # missing space after the period, dangling "that you use" -> "use it".
  gem.description   = %q{This gem is a web crawler sample code. So I don't recommend that you use it.}
  gem.summary       = %q{This gem is a web crawler sample code. So I don't recommend that you use it.}
  gem.homepage      = ""

  # Package every git-tracked file; derive executables and test files from it.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "nokogiri", "~>1.5.6"
  #gem.add_dependency "open-uri"
  #gem.add_dependency "kconv"
end
@@ -0,0 +1,80 @@
1
+ require "crawler_sample/version"
2
+ require 'crawler_sample/url'
3
+ require 'crawler_sample/string'
4
+ require "nokogiri"
5
+ require "open-uri"
6
+ require 'kconv'
7
module CrawlerSample
  # Raised by #scrape when a page cannot be fetched or parsed.
  # NOTE(review): the original "Faild" spelling is kept because #scrape raises
  # this constant by name. Previously the constant was never defined anywhere,
  # so a scrape failure surfaced as NameError instead of a catchable error.
  FaildScrape = Class.new(StandardError)

  # Mixin state, readable/writable on the host object (see lib/crawler_sample/string.rb).
  attr_accessor :top_url, :target_urls, :exclude_urls, :deep_flg, :delay, :stop_flg, :crawl_stop_count

  # Crawls the site starting at +url+, restricted to the start URL's host.
  # Yields a Nokogiri::HTML document for each page fetched successfully.
  # Stops when the URL queue empties, #crawl_is_force_stop is called,
  # more than 200 errors accumulate, or crawl_stop_count pages were fetched.
  # Raises RuntimeError when +url+ is nil.
  def crawl(url=nil)
    raise "URL is Blank" if url.nil?
    self.target_urls = [url]
    self.exclude_urls = []
    target_scheme = URI.parse(url).scheme
    target_host   = URI.parse(url).host
    error_cnt = 0
    crawl_page_cnt = 0
    # Default safety limit so an unbounded site cannot be crawled forever.
    self.crawl_stop_count = 1000 if self.crawl_stop_count.to_i <= 0
    loop do
      begin
        break if self.target_urls.empty? || self.stop_flg == true
        url = self.target_urls.pop
        self.exclude_urls << url
        site_contents = nil
        begin
          p "SuccessURL #{url}"
          site_contents = self.scrape(url)
          crawl_page_cnt += 1
        rescue
          # BUGFIX: previously a failed scrape fell through and reused the
          # previous iteration's document (or nil on the very first page),
          # causing a NoMethodError / re-yield of the wrong page below.
          error_cnt += 1
          p "ErrorURL #{url}"
        end
        if site_contents
          site_contents.search("a").each do |anc|
            # Skip anchors whose href cannot be parsed, binary-file links,
            # and links pointing to a different host than the start URL.
            begin; URI.parse(anc["href"]).host; rescue; next; end
            next unless anc["href"].scan(/\.(jpg|jpeg|png|gif|bmp|zip|exe|pdf|lzh)/i).empty?
            next if URI.parse(anc["href"]).host && target_host != URI.parse(anc["href"]).host
            # Normalize to a root-relative path, then re-anchor on the start host.
            anc["href"] = URI.parse(anc["href"]).path if URI.parse(anc["href"]).host
            anc["href"] = anc["href"].gsub(/\/\.{1,2}/, "")
            anc["href"] = "/#{anc["href"]}" if anc["href"][0] != "/"
            self.target_urls << "#{target_scheme}://#{target_host}#{anc["href"]}".gsub(/\/\.{1,2}/, "")
            self.target_urls = (self.target_urls - self.exclude_urls).uniq
          end
          yield site_contents
        end
      rescue => e
        error_cnt += 1
        p "error #{error_cnt} #{e}"
        next
      end
      crawl_is_force_stop if error_cnt > 200 || crawl_page_cnt > self.crawl_stop_count
    end
  end

  # Asks the crawl loop to stop before the next fetch.
  def crawl_is_force_stop
    self.stop_flg = true
  end

  # Fetches a single page and yields its parsed document to the given block.
  # Raises RuntimeError when +url+ is nil; FaildScrape when the fetch fails.
  def crawl_from_url(url=nil)
    raise "URL is Blank" if url.nil?
    site_contents = self.scrape(url)
    return if site_contents.nil?
    yield site_contents
  end

  # Sets the inter-request delay in seconds (nil falls back to 1).
  def crawl_delay=(delay=nil)
    self.delay = delay.nil? ? 1 : delay
  end

  # Inter-request delay in seconds; defaults to 1 when unset.
  def crawl_delay
    return self.delay.nil? ? 1 : self.delay
  end

  # Fetches +url+ and returns it as a UTF-8 Nokogiri::HTML document.
  # Sleeps #crawl_delay seconds first to throttle requests.
  # Raises FaildScrape on any fetch/parse failure.
  def scrape(url, option={})
    sleep self.crawl_delay # politeness delay between requests
    # BUGFIX: URI#open (from open-uri) instead of Kernel#open — the latter
    # no longer accepts URLs on Ruby >= 3.0.
    html = URI.parse(url).open("r:binary", "User-Agent" => "Blue Field 0.5.0.1").read
    return Nokogiri::HTML(html.toutf8, nil, 'utf-8')
  rescue
    raise FaildScrape
  end
end
@@ -0,0 +1,14 @@
1
+ require "open-uri"
2
# Convenience mixin: lets any URL string crawl itself,
# e.g. "http://example.com".crawler { |page| ... }.
class String
  include CrawlerSample

  # Crawls the site named by this string, yielding each fetched page's
  # parsed document to the given block.
  # Raises RuntimeError unless the string is a valid HTTP(S) URL.
  def crawler(&block)
    target = Url.new(:value => self)
    raise "URL is invalid" unless target.valid?
    crawl(target.value) do |contents|
      block.call(contents) if block_given?
    end
  end
end
13
+
14
+
@@ -0,0 +1,19 @@
1
# Tiny value object wrapping a URL string, with HTTP(S) validation.
class Url
  attr_accessor :scheme, :host, :value

  # Accepts a hash of attribute names to values and assigns each through
  # its writer, e.g. Url.new(:value => "http://example.com").
  def initialize(values = {})
    values.each { |key, val| send("#{key}=", val) }
  end

  # True when #value parses as an HTTP or HTTPS URI; false when it parses
  # to some other scheme or cannot be parsed at all.
  def valid?
    URI.parse(value).kind_of?(URI::HTTP)
  rescue URI::InvalidURIError
    false
  end
end
@@ -0,0 +1,3 @@
1
module CrawlerSample
  # Gem version string, bumped on each release.
  VERSION = "0.0.1"
end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crawler_sample
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Shigeki Doumae
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-06-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.5.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.5.6
27
+ description: This gem is a web crawler sample code. So I don't recommend that you use it.
28
+ email:
29
+ - shigeki.doumae@gmail.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - .gitignore
35
+ - Gemfile
36
+ - LICENSE.txt
37
+ - README.md
38
+ - Rakefile
39
+ - crawler_sample.gemspec
40
+ - lib/crawler_sample.rb
41
+ - lib/crawler_sample/string.rb
42
+ - lib/crawler_sample/url.rb
43
+ - lib/crawler_sample/version.rb
44
+ homepage: ''
45
+ licenses: []
46
+ metadata: {}
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ! '>='
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubyforge_project:
63
+ rubygems_version: 2.0.3
64
+ signing_key:
65
+ specification_version: 4
66
+ summary: This gem is a web crawler sample code. So I don't recommend that you use it.
67
+ test_files: []