the_scrap 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 3b6dbb1e2bbe11284969c7a5bcc79e3bba665b96
+   data.tar.gz: acfc7d5ac75f238fc77c578b52a68954e71b64c7
+ SHA512:
+   metadata.gz: b14dd5d813c97c2a4c9f8c954d8d523004b3d3472e502c8e881301d4213d1539e417da1fb19ca0b864335356065e9b23d8ec8f546b4e464649f92a0814005ec9
+   data.tar.gz: c989f8ccca09cdef3f892ca78264f9c53c8ef52beaa0020e3482109625e92a14f6e3a49c0b64131cb7a174688ce4734272e0b8c59cd6052ee98e130e3c5fc4b0
data/.gitignore ADDED
@@ -0,0 +1,22 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in the_scrap.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 H.J.LeoChen
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,435 @@
+ ## The Scrap
+
+ The Scrap is a web-scraping framework built on Nokogiri.
+
+ Its goals are simplicity, efficiency, deep customizability, and broad adaptability.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'the_scrap'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install the_scrap
+
+ ## Usage
+ ### 0. Overview
+
+
+ ```ruby
+
+ # encoding: utf-8
+ require 'rubygems'
+ require 'the_scrap'
+ require 'pp'
+
+ # create the scraper object
+ scrap = TheScrap::ListObj.new
+
+ # set the start URL
+ scrap.url = "http://fz.ganji.com/shouji/"
+
+ # fragment CSS selector
+ # Matches each table row or list item; each matched row/item should
+ # contain one record's details, which are extracted via the attr list below.
+ scrap.item_frag = ".layoutlist .list-bigpic"
+
+ # attrs to scrape
+ scrap.attr_name = ['.ft-tit',:inner_html]
+ scrap.attr_detail_url = ['.ft-tit',:href]
+ scrap.attr_img = ['dt a img',:src]
+ scrap.attr_desc = '.feature p'
+ scrap.attr_price = '.fc-org'
+
+ # debug
+ scrap.debug = true
+ scrap.verbose = true
+
+
+ # HTML preprocessing; each proc must return the (possibly modified) HTML
+ scrap.html_proc << lambda { |html|
+   #html.gsub(/abcd/,'efgh')
+   html
+ }
+
+ # filter scraped items
+ scrap.item_filters << lambda { |item_info|
+   return false if item_info['name'].nil? || item_info['name'].length == 0
+   return true
+ }
+
+ # data processing
+ scrap.data_proc << lambda {|url,i|
+   i['name'] = i['name'].strip
+ }
+
+ # result processing
+ scrap.result_proc << lambda {|url,items|
+   items.each do |item|
+     pp item
+   end
+ }
+
+ ##### Multi-page (pagination) scraping can be added here; see section 2.
+
+ ##### Detail-page scraping can be added here; see section 3.
+
+ # scrape
+ scrap.scrap_list
+
+ ```
+
+ ### 1. List scraping
+
+ See the previous section.
+
+ ### 2. Multi-page list scraping
+
+ ```ruby
+
+ #create ListObj
+
+ #...
+
+ ########### has many pages ###########
+ # When enabled, multiple list pages are scraped according to the paging mode.
+
+ scrap.has_many_pages = true
+
+ # paging modes:
+ # [:next_page, :total_pages, :total_records]
+
+
+ # :next_page -- follow the next-page link
+ scrap.pager_method = :next_page
+ scrap.next_page_css = ".next_page a"
+
+
+ # :total_pages -- derive the total page count, then build each page URL
+ scrap.pager_method = :total_pages
+ scrap.get_page_count = lambda { |doc|
+   if doc.css('.total_page').text =~ /(\d+)页/
+     $~[1].to_i
+   else
+     0
+   end
+ }
+
+ scrap.get_next_url = lambda { |url,next_page_number|
+   #url is http://fz.ganji.com/shouji/
+   #page url pattern http://fz.ganji.com/shouji/o#{page_number}/
+   url + "o#{next_page_number}/"
+ }
+
+ # :total_records is still in progress
+ scrap.pager_method = :total_records
+ #...
+
+ scrap.scrap_list
+
+ ```
+
+ ### 3. Extracting detail-page information
+
+ **When a DetailObj runs inside a ListObj rather than standalone, the scraped fields are merged into the ListObj's results.**
+
+ ```ruby
+
+ #create ListObj
+
+ # extract the detail page url
+ scrap.attr_detail_url = [".list a",:href]
+
+ #...
+
+ ################# has detail page ################
+ # When set, detail pages are fetched via the URLs scraped earlier.
+
+ #1. define a detail object
+ scrap_detail = TheScrap::DetailObj.new
+ scrap_detail.attr_title = ".Tbox h3"
+ scrap_detail.attr_detail = ".Tbox .newsatr"
+ scrap_detail.attr_content = [".Tbox .view",:inner_html]
+
+
+ # optional HTML preprocessing; must return the HTML
+ scrap_detail.html_proc << lambda { |html|
+   html
+ }
+
+ # optional data processing
+ scrap_detail.data_proc << lambda {|url,i|
+ }
+
+ # optional result processing
+ # Optional here: the scraped fields are merged into the list-page record,
+ # or can be persisted separately at this point.
+ scrap_detail.result_proc << lambda {|url,items|
+ }
+
+ # take the URL from the list attr and extract data with scrap_detail
+ scrap.detail_info << [scrap_detail,'detail_url']
+
+ #scrap.detail_info << [scrap_detail_1,'detail_url_1']
+
+ #...
+
+ scrap.scrap_list
+
+ ```
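+
+ Under the settings above, a single merged record might look like this (all field values are hypothetical):
+
+ ```ruby
+ # list-page fields plus the detail-page fields merged in
+ {
+   'detail_url' => 'http://example.com/news/1.html', # scraped from the list page
+   'title'      => '...',                            # scraped from the detail page
+   'detail'     => '...',
+   'content'    => '<p>...</p>'
+ }
+ ```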
+
+
+ ### 4. Attribute rules
+
+ An attribute is declared as **scrap.attr_#{name} = rule**
+
+ **All scraped attributes are collected into a Hash, keyed by the attribute name, with the extracted data as its value.**
+
+
+
+     scrap.attr_name = ".title"
+
+ yields item['name'] = "the content of the node matching .title"
+
+ A rule can be expressed in several ways:
+
+ #### 4.1 A plain CSS selector
+
+ With a plain CSS selector, the text content (inner_text) of the matching node is extracted.
+
+ ```ruby
+ @book_info.attr_author = "#divBookInfo .title a"
+ ```
+
+ #### 4.2 An array
+
+     scrap.attr_name = [css_selector,attrs]
+
+ The first element of the array is a css_selector.
+
+ The second element can be one of:
+
+ **:frag_attr**
+
+ Reads an attribute directly from the fragment itself (e.g. from the list row), since in practice you sometimes need an attribute of the list or table-row element.
+
+     scrap.attr_name = [:frag_attr,'href']
+
+ Here the first array element is :frag_attr rather than a CSS selector, because the selector was already given in scrap.item_frag. This is a special case, and the only place this form is used.
+
+ **:inner_html**
+
+ Extracts the HTML inside the node.
+
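+ A minimal sketch (the selector and attribute name are hypothetical):
+
+ ```ruby
+ # item['body'] holds the raw inner HTML, tags included, instead of plain text
+ scrap.attr_body = ['.article .body', :inner_html]
+ ```
+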
+ **:join**
+
+ For a list, extracts all of its elements and joins them with commas, e.g. tags:
+
+ ```html
+ <ul class="tags">
+   <li>ruby</li>
+   <li>rails</li>
+   <li>activerecord</li>
+ </ul>
+ ```
+
+ ```ruby
+ scrap.attr_name = ['.tags', :join]
+ ```
+
+ The above yields the string:
+
+ ```ruby
+ "ruby,rails,activerecord"
+ ```
+
+ **:array**
+
+ For a list, extracts all of its elements and returns an Array:
+
+ ```html
+ <ul class="tags">
+   <li>ruby</li>
+   <li>rails</li>
+   <li>activerecord</li>
+ </ul>
+ ```
+
+ ```ruby
+ scrap.attr_name = ['.tags', :array]
+ ```
+
+ The above yields the array:
+
+ ```ruby
+ ['ruby','rails','activerecord']
+ ```
+
+ **:src**
+
+ Reads the image's src attribute and resolves it with URI.join(current_page_url, src_value).
+
+ **:href**
+
+ Reads the link's href attribute and resolves it with URI.join(current_page_url, href_value).
+
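+ A minimal sketch of the resolution (page URL and markup are hypothetical):
+
+ ```ruby
+ scrap.attr_img = ['dt a img', :src]
+ # on the page "http://example.com/list/" with <img src="/img/a.jpg">,
+ # item['img'] becomes "http://example.com/img/a.jpg"
+ ```
+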
+ **"else"**
+
+ Any other value reads that element attribute directly, with no further processing.
+
+
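+ For example, this line from the sample below reads the node's title attribute verbatim:
+
+ ```ruby
+ @book_info.attr_rate_cnt = ["#div_pingjiarenshu", 'title']
+ ```
+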
+ **Example**
+
+ ```ruby
+ @book_info = TheScrap::DetailObj.new
+ @book_info.attr_name = "#divBookInfo .title h1"
+ @book_info.attr_author = "#divBookInfo .title a"
+ @book_info.attr_desc = [".intro .txt",:inner_html]
+ @book_info.attr_pic_url = ['.pic_box a img',:src]
+ @book_info.attr_chapters_url = ['.book_pic .opt li[1] a',:href]
+ @book_info.attr_book_info = ".info_box table tr"
+ @book_info.attr_cat_1 = '.box_title .page_site a[2]'
+ @book_info.attr_tags = ['.book_info .other .labels .box[1] a',:array]
+ @book_info.attr_user_tags = ['.book_info .other .labels .box[2] a',:join]
+ @book_info.attr_rate = '#bzhjshu'
+ @book_info.attr_rate_cnt = ["#div_pingjiarenshu",'title']
+ @book_info.attr_last_updated_at = "#divBookInfo .tabs .right"
+ @book_info.attr_last_chapter = '.updata_cont .title a'
+ @book_info.attr_last_chapter_desc = ['.updata_cont .cont a',:inner_html]
+ ```
+
+ ### 5. Paging modes
+
+ See section 2, multi-page list scraping.
+
+ ### 6. Processing scraped records
+
+ Scraped results can be post-processed before being persisted.
+
+ A simple example:
+
+ ```ruby
+ baidu.data_proc << lambda {|url,i|
+   i['title'] = i['title'].strip
+   if i['ori_url'] =~ /view.aspx\?id=(\d+)/
+     i['ori_id'] = $~[1].to_i
+   end
+
+   if i['detail'] =~ /发布时间:(.*?) /
+     i['updated_at'] = i['created_at'] = $~[1]
+   end
+
+   if i['detail'] =~ /来源:(.*?)作者:/
+     i['description'] = $~[1].strip
+   end
+
+   i.delete('detail')
+
+   i['content'].gsub!(/<script type="text\/javascript">.*?<\/script>/m,'')
+   i['content'].gsub!(/<style>.*?<\/style>/m,'')
+   i['content'].gsub!(/<img class="img_(sina|qq)_share".*?>/m,'')
+   if i['content'] =~ /image=(.*?)"/
+     #i['image'] = open($~[1]) if $~[1].length > 0
+   end
+
+   i['site_id'] = @site_id
+   i['cat_id'] = @cat_id
+
+   time = Time.parse(i['updated_at'])
+   prep = '['+time.strftime('%y%m%d')+']'
+ }
+ ```
+
+ ### 7. Result handling
+
+ #### mysql
+ ```ruby
+ require 'active_record'
+ require 'mysql2'
+ require 'activerecord-import' #recommended
+
+
+ ActiveRecord::Base.establish_connection( :adapter => "mysql2", :host => "localhost",
+   :database => "test", :username => "test", :password => "" )
+
+ ActiveRecord::Base.record_timestamps = false
+ class Article < ActiveRecord::Base
+   validates :ori_id, :uniqueness => true
+ end
+
+ # OR load the Rails env!
+
+ scrap.result_proc << lambda {|url,items|
+   articles = []
+   items.each do |item|
+     #item[:user_id] = 1
+     articles << Article.new(item)
+   end
+   Article.import articles
+ }
+ ```
+ #### mongodb
+
+ ```ruby
+ require 'mongoid'
+
+ Mongoid.load!("./mongoid.yml", :production)
+ Mongoid.allow_dynamic_fields = true
+
+ class Article
+   include Mongoid::Document
+   #....
+ end
+
+ # OR load the Rails env!
+
+ scrap.result_proc << lambda {|url,items|
+   items.each do |item|
+     #item[:user_id] = 1
+     Article.create(item)
+   end
+ }
+ ```
+
+ #### json, xml ...
+
+ ```ruby
+ #json
+ scrap.result_proc << lambda {|url,items|
+   File.open("xxx.json",'w') { |f| f.write(items.to_json) }
+ }
+
+ #xml
+ scrap.result_proc << lambda {|url,items|
+   articles = []
+   items.each do |item|
+     articles << item.to_xml
+   end
+   file = File.open("xxx.xml",'w')
+   file.write('<articles>')
+   file.write(articles.join(''))
+   file.write('</articles>')
+   file.close
+ }
+ ```
+
+ ## TODO
+
+ 1. Multi-threaded scraping
+ 2. Thread management
+ 3. Better documentation
+
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/[my-github-username]/thescrap/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/lib/the_scrap/detail_obj.rb ADDED
@@ -0,0 +1,41 @@
+ # encoding: utf-8
+ module TheScrap
+   class DetailObj < Scrap
+     def scrap( url, item_info )
+       return retryable(:tries => 3, :on => Timeout::Error) do
+         do_scrap(url,item_info)
+       end
+     end
+
+     def do_scrap( url, item_info )
+       html = open(url).read
+       html_proc.each do |dp|
+         html = dp.call(html)
+       end
+
+       doc = Nokogiri::HTML(html,nil,encoding)
+       get_attrs(url,doc,item_info)
+
+       #has detail page?
+       #can recurse into deeper levels
+       detail_info.each do |detail|
+         detail[0].scrap(item_info[detail[1]],item_info)
+       end
+
+       #proc data
+       data_proc.each do |dp|
+         dp.call(url,item_info)
+       end
+
+       #proc result
+       #result handling for detail records can be configured separately here
+       result_proc.each do |rp|
+         rp.call(url,[item_info])
+       end
+
+       pp item_info if debug?
+       return item_info
+     end
+   end
+ end
+
data/lib/the_scrap/list_obj.rb ADDED
@@ -0,0 +1,100 @@
+ # encoding: utf-8
+ module TheScrap
+   class ListObj < Scrap
+     attr_accessor :item_filters   # item filters
+     attr_accessor :has_many_pages # multiple pages?
+     attr_accessor :pager_method   # paging mode
+     attr_accessor :next_page_css  # CSS selector for the next-page link (:next_page mode)
+     attr_accessor :get_page_count # proc returning the total page count (:total_pages mode); a proc rather than a CSS selector because the number usually needs extra processing
+     attr_accessor :get_next_url   # proc building the next page's URL (:total_pages mode)
+
+     def initialize()
+       super
+       @item_filters = []
+     end
+
+     def scrap( url )
+       items = []
+
+       html = open(url).read
+       html_proc.each do |dp|
+         html = dp.call(html)
+       end
+
+       doc = Nokogiri::HTML(html,nil,encoding)
+       doc.css(item_frag).each do |item|
+
+         item_info = {}
+         get_attrs(url,item,item_info)
+
+         #filter items
+         need_skip = false
+         item_filters.each do |filter|
+           unless filter.call(item_info)
+             need_skip = true
+             break
+           end
+         end
+         next if need_skip
+
+         #has detail page?
+         detail_info.each do |detail|
+           detail[0].scrap(item_info[detail[1]],item_info)
+         end
+
+         #proc data
+         data_proc.each do |dp|
+           dp.call(url,item_info)
+         end
+
+         items << item_info
+
+         pp item_info if debug?
+         break if debug?
+       end
+
+       result_proc.each do |rp|
+         rp.call(url,items)
+       end
+
+       return doc,items
+     end
+
+     def scrap_list
+       doc,items = retryable(:tries => 3, :on => Timeout::Error) do
+         scrap(url)
+       end
+
+       return unless has_many_pages
+
+       #TODO Refactor it
+       next_page_url = nil
+       if pager_method == :next_page #follow the next-page link
+         while node = doc.css(next_page_css).first
+           next_page_url = URI.join(next_page_url||url,node['href']).to_s
+           puts next_page_url if verbose?
+           doc,items = retryable(:tries => 3, :on => Timeout::Error) do
+             scrap(next_page_url)
+           end
+           break if items.count == 0
+           break if debug?
+         end
+       elsif pager_method == :total_pages #total page count is known; pages start at 1
+         page_cnt = get_page_count.call(doc)
+         (2..page_cnt).each do |idx|
+           next_page_url = get_next_url.call(url,idx)
+           puts next_page_url if verbose?
+           doc,items = retryable(:tries => 3, :on => Timeout::Error) do
+             scrap(next_page_url)
+           end
+           break if items.count == 0
+           break if debug?
+         end
+       elsif pager_method == :total_records
+         #TODO
+         #total record count is known; could also be implemented via :total_pages by deriving the page count from the record count first
+       end
+     end
+   end
+ end
+
data/lib/the_scrap/scrap.rb ADDED
@@ -0,0 +1,100 @@
+ # encoding: utf-8
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'pp'
+ require 'timeout'
+
+ module TheScrap
+   class Scrap
+     attr_accessor :item_frag   # fragment (item) selector
+     attr_accessor :url         # start URL
+     attr_accessor :base_url    # base URL for images and links
+     attr_accessor :html_proc   # procs run on the raw HTML after fetching
+     attr_accessor :data_proc   # procs for massaging scraped data
+     attr_accessor :result_proc # procs for persistence, file output, etc.
+     attr_accessor :detail_info # detail page objects
+
+     attr_accessor :encoding
+
+     attr_accessor :debug
+     alias_method :debug?, :debug
+
+     attr_accessor :verbose
+     alias_method :verbose?, :verbose
+
+     def initialize()
+       @attrs = {}
+       @more_info = []
+       @debug = false
+       #@encoding = 'utf-8'
+       @result_proc = []
+       @detail_info = []
+       @data_proc = []
+       @html_proc = []
+     end
+
+     def retryable( options = {} )
+       opts = { :tries => 1, :on => Exception }.merge(options)
+
+       retry_exception, retries = opts[:on], opts[:tries]
+
+       begin
+         return yield
+       rescue retry_exception
+         if (retries -= 1) > 0
+           sleep 2
+           retry
+         else
+           raise
+         end
+       end
+     end
+
+     def method_missing( method_id, *arguments, &block )
+       if(method_id =~ /attr_(.*)=/)
+         name = $~[1]
+         @attrs[name] = arguments.first
+       else
+         super
+       end
+     end
+
+     protected
+     #TODO document
+     def get_attrs( url, doc, item_info )
+       @attrs.keys.each do |k|
+         unless @attrs[k].is_a? Array
+           item_info[k] = doc.css(@attrs[k]).text.strip
+         else
+           option = @attrs[k]
+           if option[0] == :frag_attr
+             item_info[k] = doc[option[1]]
+             next
+           end
+
+           node = doc.css(option[0]).first
+           next unless node
+           if(option[1] == :inner_html)
+             item_info[k] = node.inner_html
+           elsif(option[1] == :join)
+             item_info[k] = doc.css(option[0]).map{|i|i.text}.join(',')
+           elsif(option[1] == :array)
+             item_info[k] = doc.css(option[0]).map{|i|i.text}
+           else
+             if [:href,:src].include? option[1].to_sym
+               # escape spaces so URI.join does not raise
+               src = node[option[1]].strip.gsub(" ","%20")
+               begin
+                 item_info[k] = URI.join(base_url||url,src).to_s
+               rescue
+                 item_info[k] = src.to_s
+               end
+             else
+               item_info[k] = node[option[1]].strip
+             end
+           end
+         end
+       end
+     end
+   end
+ end
+
data/lib/the_scrap/version.rb ADDED
@@ -0,0 +1,3 @@
+ module TheScrap
+   VERSION = "0.0.1"
+ end
data/lib/the_scrap.rb ADDED
@@ -0,0 +1,5 @@
+ require "the_scrap/version"
+ require "the_scrap/scrap"
+ require "the_scrap/list_obj"
+ require "the_scrap/detail_obj"
+
data/the_scrap.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'the_scrap/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "the_scrap"
+   spec.version       = TheScrap::VERSION
+   spec.authors       = ["H.J.LeoChen"]
+   spec.email         = ["hjleochen@hotmail.com"]
+   spec.summary       = %q{Webpage scraping.}
+   spec.description   = %q{Webpage scraping based on Nokogiri.}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.6"
+   spec.add_development_dependency "rake"
+   spec.add_dependency "nokogiri"
+ end
metadata ADDED
@@ -0,0 +1,97 @@
+ --- !ruby/object:Gem::Specification
+ name: the_scrap
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - H.J.LeoChen
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-08-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Webpage scraping based on Nokogiri.
+ email:
+ - hjleochen@hotmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/the_scrap.rb
+ - lib/the_scrap/detail_obj.rb
+ - lib/the_scrap/list_obj.rb
+ - lib/the_scrap/scrap.rb
+ - lib/the_scrap/version.rb
+ - the_scrap.gemspec
+ homepage: ''
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Webpage scraping.
+ test_files: []