list_spider 0.3.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: cc5222e3c9eff5b6009e41e29f146de7013ad0f0
4
- data.tar.gz: 20a2d9c24d07a8fc4c9b61440bd4cabe256ea0f3
2
+ SHA256:
3
+ metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
4
+ data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
5
5
  SHA512:
6
- metadata.gz: 83cb5db4b531c32cce3ae5bac6cca154f1b43c175c6659a48cf04c86b6353d1a1763a0500426f7f6b2eb370aec66e311af48727f581d087f5677ea3c345afec2
7
- data.tar.gz: fd3d2a42c7127b72396c98c189474b7431c5e92f874085c5d6939d765ec8105abf5d58d5ff6b6dc69ca31fb46f5cefb158ce13e8f828749f122b909182933035
6
+ metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
7
+ data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
data/.gitignore ADDED
@@ -0,0 +1,84 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ ## Specific to RubyMotion:
14
+ .dat*
15
+ .repl_history
16
+ build/
17
+
18
+ ## Documentation cache and generated files:
19
+ /.yardoc/
20
+ /_yardoc/
21
+ /doc/
22
+ /rdoc/
23
+
24
+ ## Environment normalisation:
25
+ /.bundle/
26
+ /vendor/bundle
27
+ /lib/bundler/man/
28
+
29
+ # for a library or gem, you might want to ignore these files since the code is
30
+ # intended to run in multiple environments; otherwise, check them in:
31
+ # Gemfile.lock
32
+ # .ruby-version
33
+ # .ruby-gemset
34
+
35
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
36
+ .rvmrc
37
+
38
+ .DS_Store
39
+ .AppleDouble
40
+ .LSOverride
41
+
42
+ # Icon must end with two \r
43
+ Icon
44
+
45
+
46
+ # Thumbnails
47
+ ._*
48
+
49
+ # Files that might appear in the root of a volume
50
+ .DocumentRevisions-V100
51
+ .fseventsd
52
+ .Spotlight-V100
53
+ .TemporaryItems
54
+ .Trashes
55
+ .VolumeIcon.icns
56
+
57
+ # Directories potentially created on remote AFP share
58
+ .AppleDB
59
+ .AppleDesktop
60
+ Network Trash Folder
61
+ Temporary Items
62
+ .apdisk
63
+
64
+ # Windows image file caches
65
+ Thumbs.db
66
+ ehthumbs.db
67
+
68
+ # Folder config file
69
+ Desktop.ini
70
+
71
+ # Recycle Bin used on file shares
72
+ $RECYCLE.BIN/
73
+
74
+ # Windows Installer files
75
+ *.cab
76
+ *.msi
77
+ *.msm
78
+ *.msp
79
+
80
+ # Windows shortcuts
81
+ *.lnk
82
+
83
+ rubocopresult
84
+ coolshell
data/.rubocop.yml ADDED
@@ -0,0 +1,48 @@
1
+ Metrics/LineLength:
2
+ Max: 120
3
+ Metrics/MethodLength:
4
+ Max: 50
5
+ Metrics/ParameterLists:
6
+ Max: 12
7
+ Metrics/AbcSize:
8
+ Max: 50
9
+ Metrics/CyclomaticComplexity:
10
+ Max: 10
11
+ Metrics/PerceivedComplexity:
12
+ Max: 10
13
+ Style/GuardClause:
14
+ MinBodyLength: 5
15
+ Style/AsciiComments:
16
+ Enabled: false
17
+ Style/Documentation:
18
+ Enabled: false
19
+ Lint/AmbiguousRegexpLiteral:
20
+ Enabled: false
21
+ Lint/DefEndAlignment:
22
+ AutoCorrect: true
23
+ Lint/EndAlignment:
24
+ AutoCorrect: true
25
+ Style/BracesAroundHashParameters:
26
+ Enabled: false
27
+ Style/ClassAndModuleChildren:
28
+ Enabled: false
29
+ Style/AutoResourceCleanup:
30
+ Enabled: true
31
+ Style/CollectionMethods:
32
+ Enabled: true
33
+ Style/Encoding:
34
+ Enabled: true
35
+ Style/MethodCalledOnDoEndBlock:
36
+ Enabled: true
37
+ Layout/MultilineAssignmentLayout:
38
+ Enabled: true
39
+ Style/OptionHash:
40
+ Enabled: true
41
+ Style/StringMethods:
42
+ Enabled: true
43
+ Style/SymbolArray:
44
+ Enabled: true
45
+ Style/NonNilCheck:
46
+ IncludeSemanticChanges: true
47
+ Style/Send:
48
+ Enabled: true
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in list_spider.gemspec
6
+ gemspec
data/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # list_spider
2
+
3
+ A url list spider based on em-http-request.
4
+
5
+ Often we only need to spider a list of urls, parse the results, and then spider again. list_spider is built for exactly that purpose.
6
+
7
+ ## Features
8
+ * Duplicate url filtering (based on local path, so you can customize the behavior).
9
+
10
+ * Convert to UTF-8 support.
11
+
12
+ * Incremental spider support (doesn't re-spider what already exists).
13
+
14
+ * Customizable concurrency limit and interval between tasks.
15
+
16
+ * Http options support.
17
+
18
+ ## Getting started
19
+
20
+ gem install list_spider
21
+
22
+ ## Use like this
23
+ ```ruby
24
+ require 'list_spider'
25
+
26
+ DOWNLOAD_DIR = 'coolshell/'
27
+
28
+ $next_list = []
29
+
30
+ def parse_index_item(file_name)
31
+ content = File.read(file_name)
32
+ doc = Nokogiri::HTML(content)
33
+ list_group = doc.css("h2.entry-title")
34
+ link_list = list_group.css("a")
35
+
36
+ link_list.each do |link|
37
+ href = link['href']
38
+ local_path = DOWNLOAD_DIR + link.content + ".html"
39
+ #or you can save them to database for later use
40
+ $next_list<< TaskStruct.new(href, local_path)
41
+ end
42
+ end
43
+
44
+ task_list = []
45
+ task_list << TaskStruct.new('https://coolshell.cn/', DOWNLOAD_DIR + 'index.html', parse_method: method(:parse_index_item))
46
+
47
+ ListSpider.get_list(task_list)
48
+ ListSpider.get_list($next_list, max: 60)
49
+
50
+ ```
51
+
52
+ ## Or in one step
53
+ ```ruby
54
+ require 'list_spider'
55
+
56
+ DOWNLOAD_DIR = 'coolshell/'
57
+
58
+ def parse_index_item(file_name)
59
+
60
+ content = File.read(file_name)
61
+ doc = Nokogiri::HTML(content)
62
+ list_group = doc.css("h2.entry-title")
63
+ link_list = list_group.css("a")
64
+
65
+ link_list.each do |link|
66
+ href = link['href']
67
+ local_path = DOWNLOAD_DIR + link.content + ".html"
68
+ ListSpider.add_task(TaskStruct.new(href, local_path))
69
+ end
70
+ end
71
+
72
+ #get_one is a simple function for one taskstruct situation
73
+ ListSpider.get_one(TaskStruct.new(
74
+ 'https://coolshell.cn/',
75
+ DOWNLOAD_DIR + 'index.html',
76
+ parse_method: method(:parse_index_item)),
77
+ max: 60)
78
+
79
+ ```
80
+
81
+ ## You can define parse method in four forms
82
+
83
+ ```ruby
84
+ def parse_response(file_name)
85
+ #...
86
+ end
87
+
88
+
89
+ # extra_data is passed by TaskStruct's extra_data param
90
+
91
+ def parse_response(file_name, extra_data)
92
+ #...
93
+ end
94
+
95
+
96
+ # response_header is a EventMachine::HttpResponseHeader object
97
+ # you can use it like this:
98
+ # response_header.status
99
+ # response_header.cookie
100
+ # response_header['Last-Modified']
101
+
102
+ def parse_response(file_name, extra_data, response_header)
103
+ response_header.status
104
+ response_header['Last-Modified']
105
+
106
+ #...
107
+ end
108
+
109
+ # req is a EventMachine::HttpClientOptions object
110
+ # you can use it like this:
111
+ # req.body
112
+ # req.headers
113
+ # req.uri
114
+ # req.host
115
+ # req.port
116
+ def parse_response(file_name, extra_data, response_header, req)
117
+ puts req.body
118
+ puts req.headers
119
+ puts req.uri
120
+ puts req.host
121
+ puts req.port
122
+
123
+ #...
124
+ end
125
+
126
+ ```
127
+
128
+ ## And there are many options you can use
129
+
130
+ ```ruby
131
+ TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
132
+ ```
133
+
134
+ ```ruby
135
+ #no concurrent limit (note: only use when list size is small)
136
+ ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
137
+
138
+ #sleep a random time, often used for sites that limit spiders
139
+ ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
140
+
141
+ #set random time range
142
+ ListSpider.get_list(down_list, interval: (1..10), max: 1)
143
+
144
+ ```
145
+
146
+ ### The options below take effect for the whole program (set them before calling get_list)
147
+
148
+ ```ruby
149
+ #set proxy
150
+ ListSpider.set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
151
+
152
+ #set http header (a header set on a TaskStruct takes priority)
153
+ ListSpider.set_header_option(header_option)
154
+
155
+ #convert the file encoding to utf-8
156
+ ListSpider.convert_to_utf8 = true
157
+
158
+ #set connect timeout
159
+ ListSpider.connect_timeout = 2*60
160
+
161
+ #overwrite existing files
162
+ ListSpider.overwrite_exist = false
163
+
164
+ #set redirect depth
165
+ ListSpider.max_redirects = 10
166
+
167
+ ```
168
+
169
+ ## There is a utility class to help check or delete invalid files
170
+
171
+ ```ruby
172
+ FileFilter.delete(CustomConfig::DIR + '*', size_threshold: 300)
173
+
174
+ FileFilter.check(CustomConfig::DIR + '*', size_threshold: 300)
175
+
176
+ FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
177
+
178
+ #params
179
+ FileFilter.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
180
+
181
+ FileFilter.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
182
+ ```
183
+
184
+ ### License
185
+
186
+ (MIT License) - Copyright (c) 2016 Charles Zhang
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler/gem_tasks'
2
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'bundler/setup'
4
+ require 'list_spider'
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require 'irb'
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/check_code.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+
3
+ rubocop -a -D -f simple -o rubocopresult
data/lib/list_spider.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'list_spider/version'
1
2
  require 'em-http-request'
2
3
  require 'nokogiri'
3
4
  require 'fileutils'
@@ -108,7 +109,7 @@ module ListSpider
108
109
  end
109
110
  end
110
111
  succeed_list << e
111
- rescue => e
112
+ rescue StandardError => e
112
113
  puts e
113
114
  end
114
115
  end
@@ -122,7 +123,7 @@ module ListSpider
122
123
  if e.http_method == :get
123
124
  ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
124
125
  elsif e.http_method == :post
125
- ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
+ ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
126
127
  end
127
128
 
128
129
  if ret
@@ -134,7 +135,7 @@ module ListSpider
134
135
 
135
136
  begin
136
137
  multi.add e.local_path, w
137
- rescue => exception
138
+ rescue StandardError => exception
138
139
  puts exception
139
140
  puts e.href
140
141
  puts e.local_path
@@ -248,7 +249,7 @@ module ListSpider
248
249
  end
249
250
 
250
251
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
251
- if interval.is_a?Range
252
+ if interval.is_a? Range
252
253
  @random_time_range = interval
253
254
  interval = RANDOM_TIME
254
255
  end
@@ -273,7 +274,7 @@ module ListSpider
273
274
  end
274
275
 
275
276
  def add_task(task)
276
- if task.is_a?Array
277
+ if task.is_a? Array
277
278
  need_down_list = filter_list(task)
278
279
  @down_list += need_down_list
279
280
  elsif task.is_a?TaskStruct
@@ -0,0 +1,3 @@
1
+ module ListSpider
2
+ VERSION = '1.0.0'.freeze
3
+ end
data/lib/spider_helper.rb CHANGED
@@ -27,7 +27,7 @@ module SpiderHelper
27
27
  else
28
28
  puts res
29
29
  end
30
- rescue => e
30
+ rescue StandardError => e
31
31
  puts e.backtrace
32
32
  puts e
33
33
  false
@@ -59,7 +59,7 @@ module SpiderHelper
59
59
  else
60
60
  puts res
61
61
  end
62
- rescue => e
62
+ rescue StandardError => e
63
63
  puts e
64
64
  false
65
65
  end
@@ -0,0 +1,31 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'list_spider/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'list_spider'
8
+ spec.version = ListSpider::VERSION
9
+ spec.authors = ['Charles Zhang']
10
+ spec.email = ['gis05zc@163.com']
11
+
12
+ spec.summary = 'List Spider'
13
+ spec.description = 'A url list spider based on em-http-request.'
14
+ spec.homepage = 'https://github.com/chinazhangchao/list_spider'
15
+ spec.license = 'MIT'
16
+
17
+ spec.files =
18
+ `git ls-files -z`.split("\x0").reject do |f|
19
+ f.match(%r{^(test|spec|features)/})
20
+ end
21
+ spec.bindir = 'exe'
22
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_development_dependency 'bundler', '~> 1.16'
26
+ spec.add_development_dependency 'rake', '~> 10.0'
27
+
28
+ spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
29
+ spec.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.7'
30
+ spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
31
+ end
data/spider_example.rb ADDED
@@ -0,0 +1,29 @@
1
+ require 'list_spider'
2
+ # require File.expand_path('../lib/list_spider', __FILE__)
3
+
4
+ DOWNLOAD_DIR = 'coolshell/'.freeze
5
+
6
+ def parse_index_item(file_name)
7
+ content = File.read(file_name)
8
+ doc = Nokogiri::HTML(content)
9
+ list_group = doc.css('h2.entry-title')
10
+ link_list = list_group.css('a')
11
+
12
+ link_list.each do |link|
13
+ href = link['href']
14
+ local_path = DOWNLOAD_DIR + link.content + '.html'
15
+ ListSpider.add_task(TaskStruct.new(href, local_path))
16
+ end
17
+ end
18
+
19
+ # ListSpider.convert_to_utf8 = true
20
+
21
+ # get_one is a simple function for one taskstruct situation
22
+ ListSpider.get_one(
23
+ TaskStruct.new(
24
+ 'https://coolshell.cn/',
25
+ DOWNLOAD_DIR + 'index.html',
26
+ parse_method: method(:parse_index_item)
27
+ ),
28
+ max: 60
29
+ )
@@ -0,0 +1,29 @@
1
+ require 'list_spider'
2
+
3
+ DOWNLOAD_DIR = 'coolshell/'.freeze
4
+
5
+ @next_list = []
6
+
7
+ def parse_index_item(file_name)
8
+ content = File.read(file_name)
9
+ doc = Nokogiri::HTML(content)
10
+ list_group = doc.css('h2.entry-title')
11
+ link_list = list_group.css('a')
12
+
13
+ link_list.each do |link|
14
+ href = link['href']
15
+ local_path = DOWNLOAD_DIR + link.content + '.html'
16
+ # or you can save them to database for later use
17
+ @next_list << TaskStruct.new(href, local_path)
18
+ end
19
+ end
20
+
21
+ task_list = []
22
+ task_list << TaskStruct.new(
23
+ 'https://coolshell.cn/',
24
+ DOWNLOAD_DIR + 'index.html',
25
+ parse_method: method(:parse_index_item)
26
+ )
27
+
28
+ ListSpider.get_list(task_list)
29
+ ListSpider.get_list(@next_list, max: 60)
metadata CHANGED
@@ -1,15 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
- bindir: bin
9
+ bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-24 00:00:00.000000000 Z
11
+ date: 2018-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
13
41
  - !ruby/object:Gem::Dependency
14
42
  name: em-http-request
15
43
  requirement: !ruby/object:Gem::Requirement
@@ -71,14 +99,27 @@ dependencies:
71
99
  - !ruby/object:Gem::Version
72
100
  version: 1.6.1
73
101
  description: A url list spider based on em-http-request.
74
- email: gis05zc@163.com
102
+ email:
103
+ - gis05zc@163.com
75
104
  executables: []
76
105
  extensions: []
77
106
  extra_rdoc_files: []
78
107
  files:
108
+ - ".gitignore"
109
+ - ".rubocop.yml"
110
+ - Gemfile
111
+ - README.md
112
+ - Rakefile
113
+ - bin/console
114
+ - bin/setup
115
+ - check_code.sh
79
116
  - lib/file_filter.rb
80
117
  - lib/list_spider.rb
118
+ - lib/list_spider/version.rb
81
119
  - lib/spider_helper.rb
120
+ - list_spider.gemspec
121
+ - spider_example.rb
122
+ - spider_example_2.rb
82
123
  homepage: https://github.com/chinazhangchao/list_spider
83
124
  licenses:
84
125
  - MIT
@@ -99,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
140
  version: '0'
100
141
  requirements: []
101
142
  rubyforge_project:
102
- rubygems_version: 2.6.6
143
+ rubygems_version: 2.7.3
103
144
  signing_key:
104
145
  specification_version: 4
105
146
  summary: List Spider