list_spider 0.3.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: cc5222e3c9eff5b6009e41e29f146de7013ad0f0
4
- data.tar.gz: 20a2d9c24d07a8fc4c9b61440bd4cabe256ea0f3
2
+ SHA256:
3
+ metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
4
+ data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
5
5
  SHA512:
6
- metadata.gz: 83cb5db4b531c32cce3ae5bac6cca154f1b43c175c6659a48cf04c86b6353d1a1763a0500426f7f6b2eb370aec66e311af48727f581d087f5677ea3c345afec2
7
- data.tar.gz: fd3d2a42c7127b72396c98c189474b7431c5e92f874085c5d6939d765ec8105abf5d58d5ff6b6dc69ca31fb46f5cefb158ce13e8f828749f122b909182933035
6
+ metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
7
+ data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
data/.gitignore ADDED
@@ -0,0 +1,84 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ ## Specific to RubyMotion:
14
+ .dat*
15
+ .repl_history
16
+ build/
17
+
18
+ ## Documentation cache and generated files:
19
+ /.yardoc/
20
+ /_yardoc/
21
+ /doc/
22
+ /rdoc/
23
+
24
+ ## Environment normalisation:
25
+ /.bundle/
26
+ /vendor/bundle
27
+ /lib/bundler/man/
28
+
29
+ # for a library or gem, you might want to ignore these files since the code is
30
+ # intended to run in multiple environments; otherwise, check them in:
31
+ # Gemfile.lock
32
+ # .ruby-version
33
+ # .ruby-gemset
34
+
35
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
+ .rvmrc
37
+
38
+ .DS_Store
39
+ .AppleDouble
40
+ .LSOverride
41
+
42
+ # Icon must end with two \r
43
+ Icon
44
+
45
+
46
+ # Thumbnails
47
+ ._*
48
+
49
+ # Files that might appear in the root of a volume
50
+ .DocumentRevisions-V100
51
+ .fseventsd
52
+ .Spotlight-V100
53
+ .TemporaryItems
54
+ .Trashes
55
+ .VolumeIcon.icns
56
+
57
+ # Directories potentially created on remote AFP share
58
+ .AppleDB
59
+ .AppleDesktop
60
+ Network Trash Folder
61
+ Temporary Items
62
+ .apdisk
63
+
64
+ # Windows image file caches
65
+ Thumbs.db
66
+ ehthumbs.db
67
+
68
+ # Folder config file
69
+ Desktop.ini
70
+
71
+ # Recycle Bin used on file shares
72
+ $RECYCLE.BIN/
73
+
74
+ # Windows Installer files
75
+ *.cab
76
+ *.msi
77
+ *.msm
78
+ *.msp
79
+
80
+ # Windows shortcuts
81
+ *.lnk
82
+
83
+ rubocopresult
84
+ coolshell
data/.rubocop.yml ADDED
@@ -0,0 +1,48 @@
1
+ Metrics/LineLength:
2
+ Max: 120
3
+ Metrics/MethodLength:
4
+ Max: 50
5
+ Metrics/ParameterLists:
6
+ Max: 12
7
+ Metrics/AbcSize:
8
+ Max: 50
9
+ Metrics/CyclomaticComplexity:
10
+ Max: 10
11
+ Metrics/PerceivedComplexity:
12
+ Max: 10
13
+ Style/GuardClause:
14
+ MinBodyLength: 5
15
+ Style/AsciiComments:
16
+ Enabled: false
17
+ Style/Documentation:
18
+ Enabled: false
19
+ Lint/AmbiguousRegexpLiteral:
20
+ Enabled: false
21
+ Lint/DefEndAlignment:
22
+ AutoCorrect: true
23
+ Lint/EndAlignment:
24
+ AutoCorrect: true
25
+ Style/BracesAroundHashParameters:
26
+ Enabled: false
27
+ Style/ClassAndModuleChildren:
28
+ Enabled: false
29
+ Style/AutoResourceCleanup:
30
+ Enabled: true
31
+ Style/CollectionMethods:
32
+ Enabled: true
33
+ Style/Encoding:
34
+ Enabled: true
35
+ Style/MethodCalledOnDoEndBlock:
36
+ Enabled: true
37
+ Layout/MultilineAssignmentLayout:
38
+ Enabled: true
39
+ Style/OptionHash:
40
+ Enabled: true
41
+ Style/StringMethods:
42
+ Enabled: true
43
+ Style/SymbolArray:
44
+ Enabled: true
45
+ Style/NonNilCheck:
46
+ IncludeSemanticChanges: true
47
+ Style/Send:
48
+ Enabled: true
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in list_spider.gemspec
6
+ gemspec
data/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # list_spider
2
+
3
+ A url list spider based on em-http-request.
4
+
5
+ Often we only need to crawl a list of URLs, parse the downloaded pages, and then crawl again. This gem is built for exactly that purpose.
6
+
7
+ ## Features
8
+ * Duplicate URL filtering (based on local path, so you can customize the behavior).
9
+
10
+ * Convert to UTF-8 support.
11
+
12
+ * Incremental spidering support (files that already exist are not downloaded again).
13
+
14
+ * Customizable concurrency limit and interval between tasks.
15
+
16
+ * Http options support.
17
+
18
+ ## Getting started
19
+
20
+ gem install list_spider
21
+
22
+ ## Use like this
23
+ ```ruby
24
+ require 'list_spider'
25
+
26
+ DOWNLOAD_DIR = 'coolshell/'
27
+
28
+ $next_list = []
29
+
30
+ def parse_index_item(file_name)
31
+ content = File.read(file_name)
32
+ doc = Nokogiri::HTML(content)
33
+ list_group = doc.css("h2.entry-title")
34
+ link_list = list_group.css("a")
35
+
36
+ link_list.each do |link|
37
+ href = link['href']
38
+ local_path = DOWNLOAD_DIR + link.content + ".html"
39
+ #or you can save them to database for later use
40
+ $next_list<< TaskStruct.new(href, local_path)
41
+ end
42
+ end
43
+
44
+ task_list = []
45
+ task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
46
+
47
+ ListSpider.get_list(task_list)
48
+ ListSpider.get_list($next_list, max: 60)
49
+
50
+ ```
51
+
52
+ ## Or in one step
53
+ ```ruby
54
+ require 'list_spider'
55
+
56
+ DOWNLOAD_DIR = 'coolshell/'
57
+
58
+ def parse_index_item(file_name)
59
+
60
+ content = File.read(file_name)
61
+ doc = Nokogiri::HTML(content)
62
+ list_group = doc.css("h2.entry-title")
63
+ link_list = list_group.css("a")
64
+
65
+ link_list.each do |link|
66
+ href = link['href']
67
+ local_path = DOWNLOAD_DIR + link.content + ".html"
68
+ ListSpider.add_task(TaskStruct.new(href, local_path))
69
+ end
70
+ end
71
+
72
+ #get_one is a simple function for one taskstruct situation
73
+ ListSpider.get_one(TaskStruct.new(
74
+ 'https://coolshell.cn/',
75
+ DOWNLOAD_DIR + 'index.html',
76
+ parse_method: method(:parse_index_item)),
77
+ max: 60)
78
+
79
+ ```
80
+
81
+ ## You can define parse method in four forms
82
+
83
+ ```ruby
84
+ def parse_response(file_name)
85
+ #...
86
+ end
87
+
88
+
89
+ # extra_data is passed by TaskStruct's extra_data param
90
+
91
+ def parse_response(file_name, extra_data)
92
+ #...
93
+ end
94
+
95
+
96
+ # response_header is a EventMachine::HttpResponseHeader object
97
+ # you can use it like this:
98
+ # response_header.status
99
+ # response_header.cookie
100
+ # response_header['Last-Modified']
101
+
102
+ def parse_response(file_name, extra_data, response_header)
103
+ response_header.status
104
+ response_header['Last-Modified']
105
+
106
+ #...
107
+ end
108
+
109
+ # req is a EventMachine::HttpClientOptions object
110
+ # you can use it like this:
111
+ # req.body
112
+ # req.headers
113
+ # req.uri
114
+ # req.host
115
+ # req.port
116
+ def parse_response(file_name, extra_data, response_header, req)
117
+ puts req.body
118
+ puts req.headers
119
+ puts req.uri
120
+ puts req.host
121
+ puts req.port
122
+
123
+ #...
124
+ end
125
+
126
+ ```
127
+
128
+ ## And there are many options you can use
129
+
130
+ ```ruby
131
+ TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
132
+ ```
133
+
134
+ ```ruby
135
+ #no concurrent limit (note: only use when list size is small)
136
+ ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
137
+
138
+ #sleep for a random time between tasks; useful for sites that rate-limit spiders
139
+ ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
140
+
141
+ #set random time range
142
+ ListSpider.get_list(down_list, interval: (1..10), max: 1)
143
+
144
+ ```
145
+
146
+ ### The options below take effect for the whole program (set them before calling get_list)
147
+
148
+ ```ruby
149
+ #set proxy
150
+ ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
151
+
152
+ #set http header (a header set on a TaskStruct takes priority over this one)
153
+ ListSpider.set_header_option(header_option)
154
+
155
+ #convert the file encoding to utf-8
156
+ ListSpider.convert_to_utf8 = true
157
+
158
+ #set connect timeout
159
+ ListSpider.connect_timeout = 2*60
160
+
161
+ #whether to overwrite existing files
162
+ ListSpider.overwrite_exist = false
163
+
164
+ #set redirect depth
165
+ ListSpider.max_redirects = 10
166
+
167
+ ```
168
+
169
+ ## There is a utility class to help check or delete invalid files
170
+
171
+ ```ruby
172
+ FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
173
+
174
+ FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
175
+
176
+ FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
177
+
178
+ #params
179
+ FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
180
+
181
+ FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
182
+ ```
183
+
184
+ ### License
185
+
186
+ (MIT License) - Copyright (c) 2016 Charles Zhang
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'list_spider'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/check_code.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+
3
+ rubocop -a -D -f simple -o rubocopresult
data/lib/list_spider.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'list_spider/version'
1
2
  require 'em-http-request'
2
3
  require 'nokogiri'
3
4
  require 'fileutils'
@@ -108,7 +109,7 @@ module ListSpider
108
109
  end
109
110
  end
110
111
  succeed_list << e
111
- rescue => e
112
+ rescue StandardError => e
112
113
  puts e
113
114
  end
114
115
  end
@@ -122,7 +123,7 @@ module ListSpider
122
123
  if e.http_method == :get
123
124
  ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
124
125
  elsif e.http_method == :post
125
- ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
+ ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
127
  end
127
128
 
128
129
  if ret
@@ -134,7 +135,7 @@ module ListSpider
134
135
 
135
136
  begin
136
137
  multi.add e.local_path, w
137
- rescue => exception
138
+ rescue StandardError => exception
138
139
  puts exception
139
140
  puts e.href
140
141
  puts e.local_path
@@ -248,7 +249,7 @@ module ListSpider
248
249
  end
249
250
 
250
251
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
251
- if interval.is_a?Range
252
+ if interval.is_a? Range
252
253
  @random_time_range = interval
253
254
  interval = RANDOM_TIME
254
255
  end
@@ -273,7 +274,7 @@ module ListSpider
273
274
  end
274
275
 
275
276
  def add_task(task)
276
- if task.is_a?Array
277
+ if task.is_a? Array
277
278
  need_down_list = filter_list(task)
278
279
  @down_list += need_down_list
279
280
  elsif task.is_a?TaskStruct
@@ -0,0 +1,3 @@
1
+ module ListSpider
2
+ VERSION = '1.0.0'.freeze
3
+ end
data/lib/spider_helper.rb CHANGED
@@ -27,7 +27,7 @@ module SpiderHelper
27
27
  else
28
28
  puts res
29
29
  end
30
- rescue => e
30
+ rescue StandardError => e
31
31
  puts e.backtrace
32
32
  puts e
33
33
  false
@@ -59,7 +59,7 @@ module SpiderHelper
59
59
  else
60
60
  puts res
61
61
  end
62
- rescue => e
62
+ rescue StandardError => e
63
63
  puts e
64
64
  false
65
65
  end
@@ -0,0 +1,31 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_spider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'list_spider'
8
+ spec.version = ListSpider::VERSION
9
+ spec.authors = ['Charles Zhang']
10
+ spec.email = ['gis05zc@163.com']
11
+
12
+ spec.summary = 'List Spider'
13
+ spec.description = 'A url list spider based on em-http-request.'
14
+ spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files =
18
+ `git ls-files -z`.split("\x0").reject do |f|
19
+ f.match(%r{^(test|spec|features)/})
20
+ end
21
+ spec.bindir = 'exe'
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_development_dependency 'bundler', '~> 1.16'
26
+ spec.add_development_dependency 'rake', '~> 10.0'
27
+
28
+ spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
+ spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
30
+ spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
+ end
data/spider_example.rb ADDED
@@ -0,0 +1,29 @@
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ DOWNLOAD_DIR = 'coolshell/'.freeze
5
+
6
+ def parse_index_item(file_name)
7
+ content = File.read(file_name)
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ local_path = DOWNLOAD_DIR + link.content + '.html'
15
+ ListSpider.add_task(TaskStruct.new(href, local_path))
16
+ end
17
+ end
18
+
19
+ # ListSpider.convert_to_utf8 = true
20
+
21
+ # get_one is a simple function for one taskstruct situation
22
+ ListSpider.get_one(
23
+ TaskStruct.new(
24
+ 'https://coolshell.cn/',
25
+ DOWNLOAD_DIR + 'index.html',
26
+ parse_method: method(:parse_index_item)
27
+ ),
28
+ max: 60
29
+ )
@@ -0,0 +1,29 @@
1
+ require 'list_spider'
2
+
3
+ DOWNLOAD_DIR = 'coolshell/'.freeze
4
+
5
+ @next_list = []
6
+
7
+ def parse_index_item(file_name)
8
+ content = File.read(file_name)
9
+ doc = Nokogiri::HTML(content)
10
+ list_group = doc.css('h2.entry-title')
11
+ link_list = list_group.css('a')
12
+
13
+ link_list.each do |link|
14
+ href = link['href']
15
+ local_path = DOWNLOAD_DIR + link.content + '.html'
16
+ # or you can save them to database for later use
17
+ @next_list << TaskStruct.new(href, local_path)
18
+ end
19
+ end
20
+
21
+ task_list = []
22
+ task_list << TaskStruct.new(
23
+ 'https://coolshell.cn/',
24
+ DOWNLOAD_DIR + 'index.html',
25
+ parse_method: method(:parse_index_item)
26
+ )
27
+
28
+ ListSpider.get_list(task_list)
29
+ ListSpider.get_list(@next_list, max: 60)
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-24 00:00:00.000000000 Z
11
+ date: 2018-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: em-http-request
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -71,14 +99,27 @@ dependencies:
71
99
  - !ruby/object:Gem::Version
72
100
  version: 1.6.1
73
101
  description: A url list spider based on em-http-request.
74
- email: gis05zc@163.com
102
+ email:
103
+ - gis05zc@163.com
75
104
  executables: []
76
105
  extensions: []
77
106
  extra_rdoc_files: []
78
107
  files:
108
+ - ".gitignore"
109
+ - ".rubocop.yml"
110
+ - Gemfile
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/setup
115
+ - check_code.sh
79
116
  - lib/file_filter.rb
80
117
  - lib/list_spider.rb
118
+ - lib/list_spider/version.rb
81
119
  - lib/spider_helper.rb
120
+ - list_spider.gemspec
121
+ - spider_example.rb
122
+ - spider_example_2.rb
82
123
  homepage: https://github.com/chinazhangchao/list_spider
83
124
  licenses:
84
125
  - MIT
@@ -99,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
140
  version: '0'
100
141
  requirements: []
101
142
  rubyforge_project:
102
- rubygems_version: 2.6.6
143
+ rubygems_version: 2.7.3
103
144
  signing_key:
104
145
  specification_version: 4
105
146
  summary: List Spider