spider2 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. data/MIT-LICENSE +20 -0
  2. data/README +15 -0
  3. data/Rakefile +23 -0
  4. data/init.rb +3 -0
  5. data/install.rb +2 -0
  6. data/lib/generators/spider/spider_generator.rb +42 -0
  7. data/lib/generators/spider/templates/base_page.rb +6 -0
  8. data/lib/generators/spider/templates/base_page_spec.rb +13 -0
  9. data/lib/generators/spider/templates/index_page.rb +6 -0
  10. data/lib/generators/spider/templates/index_page_spec.rb +14 -0
  11. data/lib/generators/spider/templates/index_page_test.rb +10 -0
  12. data/lib/generators/spider/templates/list_page.rb +6 -0
  13. data/lib/generators/spider/templates/list_page_spec.rb +22 -0
  14. data/lib/generators/spider/templates/list_page_test.rb +10 -0
  15. data/lib/generators/spider/templates/show_page.rb +14 -0
  16. data/lib/generators/spider/templates/show_page_spec.rb +19 -0
  17. data/lib/generators/spider/templates/show_page_test.rb +10 -0
  18. data/lib/generators/spider/templates/site.rb +7 -0
  19. data/lib/generators/spider/templates/site_spec.rb +13 -0
  20. data/lib/generators/spider/templates/test.rb +10 -0
  21. data/lib/generators/spider_migration/spider_migration_generator.rb +11 -0
  22. data/lib/generators/spider_migration/templates/migration.rb +42 -0
  23. data/lib/spider/active_record_methods.rb +60 -0
  24. data/lib/spider/http.rb +43 -0
  25. data/lib/spider/page/filter.rb +132 -0
  26. data/lib/spider/page/label.rb +28 -0
  27. data/lib/spider/page/pagination.rb +142 -0
  28. data/lib/spider/page/proxy.rb +149 -0
  29. data/lib/spider/page/publish.rb +78 -0
  30. data/lib/spider/page/validation.rb +136 -0
  31. data/lib/spider/page.rb +759 -0
  32. data/lib/spider/site.rb +225 -0
  33. data/lib/spider/spider_page.rb +18 -0
  34. data/lib/spider/spider_page_label.rb +5 -0
  35. data/lib/spider/version.rb +3 -0
  36. data/lib/spider.rb +81 -0
  37. data/lib/tasks/spider_tasks.rake +86 -0
  38. data/test/spider_fu_test.rb +9 -0
  39. data/test/test_helper.rb +4 -0
  40. data/uninstall.rb +2 -0
  41. metadata +151 -0
data/lib/spider/page.rb
@@ -0,0 +1,759 @@
+ # encoding: utf-8
+ class Spider::PageExistsAndDoneException < Exception; end
+
+ require "iconv"
+ require "digest/md5"
+ require "htmlentities"
+ # Essentially, every web page is a Page.
+ # Every page has attributes such as encoding, title, and url.
+ # Every page carries information we are interested in, which we need to extract:
+ # on a list page we care about the URLs of the article list,
+ # while on an article page we care about specific article attributes.
+ # Every page may have a parent page,
+ # and a page can inherit attributes such as encoding from its parent.
+ # Following this principle, crawling can be implemented with a single class,
+ # and that is exactly how the current implementation works.
+ # For example:
+ # Spider::Page.new('http://www.google.com') do |page|
+ #   page.pages do |page|
+ #     # page here is still a Page instance, but its URL was extracted from www.google.com
+ #     # links are found with Hpricot's search("a") method by default;
+ #     # the page is then parsed and its labels are extracted and stored on the page
+ #     page.label "title",page.title
+ #     page.label "body",page.title
+ #     page.label "author",page.doc.search("#author").inner_html
+ #     # if child pages are needed as well, we can go one level deeper:
+ #     page.pages /^http\:\/\// do |page|
+ #       # the child pages arrive here
+ #       # and can be processed further
+ #     end
+ #   end
+ # end
+ # The above is sample code.
+
+ class Spider::Page
+
+   def self.coder
+     @coder ||= HTMLEntities.new
+   end
+
+   def coder
+     self.class.coder
+   end
+
+   def self.class_attribute(*args)
+     # class_attribute still treats Array and Hash differently from class_inheritable_accessor,
+     # so for now we have to stick with class_inheritable_accessor
+     class_inheritable_accessor *args # no better way found yet
+   end
+
+   extend ActiveModel::Callbacks
+
+   define_model_callbacks :fetch,:save,:crawl
+
+   attr_reader :url,:doc
+   attr_accessor :title,:content_length,:response,:histories,:encoding,:parent
+
+   class_attribute :options
+   self.options = {}
+   self.options[:example_url] ||= []
+
+   SEPARATOR = "<!-- PAGINATE SEPARATOR -->"
+
+   @@paginate_symbol = "--NEXTPAGE--"
+   cattr_accessor :paginate_symbol
+
+   def self.delay(seconds)
+     before_fetch do |page|
+       logger.debug "#{Time.now.to_i}:delay #{seconds}"
+       sleep seconds
+       logger.debug "#{Time.now.to_i}:after delay"
+       true # continue
+     end
+   end
+
+   def self.set_example_url(*urls)
+     self.options[:example_url] ||= []
+     self.options[:example_url] += urls
+   end
+
+   def self.example_url
+     self.options[:example_url]
+   end
+
+   def self.encoding(new_encoding)
+     self.options['encoding'] = new_encoding
+   end
+
+   def self.ignore_existing(ignore=true)
+     self.options[:ignore_existing] = ignore
+   end
+
+   def ignore_existing?
+     !!options[:ignore_existing]
+   end
+
+   class_attribute :attribute_names
+   self.attribute_names = []
+
+   class_attribute :site
+
+   # Define attributes.
+   # All defined attributes can be read back via the attributes method,
+   # and when the page is saved they go into the SpiderLabel database table.
+   # Use this to declare which pieces of data this page should provide.
+   def self.define_attributes(*attributes)
+     attributes.each do |attribute|
+       define_attribute attribute
+     end
+   end
+
+   def self.define_attribute(attribute)
+     self.attribute_names << attribute
+     self.attribute_names.uniq!
+     self.attribute_names.compact!
+     attribute
+   end
+
+   # Returns the defined attributes, for example:
+   #   class ShowPage < Spider::Page
+   #     define_attributes :title,:body
+   #     def body
+   #       "body"
+   #     end
+   #   end
+   #   show_page = ShowPage.new "http://www.powerapple.com"
+   #   show_page.attributes # => {:title=>nil,:body=>"body"}
+   def attributes(reload=false)
+     hash = {}
+     attribute_names.each do |name|
+       hash[name] = self[name]
+     end
+     hash
+   end
+
+   # Shortcut for reading the <base href="xxxx" /> tag
+   def base_href
+     doc.at("base").try(:attributes).try(:[],"href")
+   end
+
+   # Parse params out of the url's query string
+   def params
+     Rack::Utils.parse_query(uri.query).tap do |r|
+       r.each_pair do |key,value|
+         r[key.to_sym] = value
+       end
+     end
+   end
+
+   # Shortcut access to attributes
+   def [](*args)
+     read_attribute *args
+   end
+
+   # Shortcut access to attributes
+   def []=(name,value)
+     @attributes[name] = value
+   end
+
+   # Start crawling
+   def crawl(force=false)
+     unless ignore_existing? || force
+       if Spider::SpiderPage.find_by_url(url)
+         logger.info "url: #{url} already exists. skip."
+         return
+       end
+     end
+
+     logger.debug "#{self} before crawl"
+     run_callbacks :crawl do
+       save
+     end
+   end
+
+   def self.site
+     if name =~ /^([^:]+)/
+       begin
+         self.site = "#{$1}::Site".constantize.instance
+       rescue Exception=>e
+         nil
+       end
+     end
+   end
+
+   def site
+     self.class.site
+   end
+
+   def self.separator
+     SEPARATOR
+   end
+
+   # Logger
+   def logger
+     self.class.logger
+   end
+
+   # Logger
+   def self.logger
+     Spider.logger
+   end
+
+   def request(*args)
+     run_callbacks(:fetch) do
+       fetch_content_from_url(*args)
+     end
+   end
+
+   # url is required; everything else can be passed as options
+   def initialize(url,options={})
+     options.reverse_merge! :debug=>false
+     @options = options
+     @parent = options[:parent]
+     @encoding = options[:encoding] if options[:encoding]
+     @url = url.strip
+     @response = nil
+     @attributes = {}
+   end
+
+   # Debug mode?
+   # When debugging is on, the system logs much more information.
+   def debug?
+     @options[:debug]
+   end
+
+   # The page we came from:
+   #   parent_page.pages '/xxxx/yyy.html' do |page|
+   #     page.parent # => parent_page
+   #   end
+   def parent
+     @options[:parent]
+   end
+
+   alias_method :referer,:parent
+
+   # Returns an array containing all ancestor pages
+   def parents
+     parents = []
+     i = self
+     while (p = i.parent)
+       parents << p
+       i = p
+     end
+     parents.compact.reverse
+   end
+
+   alias_method :histories,:parents
+
+   # The default page encoding.
+   # Takes lower priority than an encoding given to the pages method:
+   #   Spider::Site.register 'site' do |site|
+   #     site.encoding = 'big5'
+   #     site.pages 'http://www.google.com' do |page|
+   #       page.encoding # => big5
+   #     end
+   #     site.pages 'http://www.baidu.com',:encoding=>"gbk" do |page|
+   #       page.encoding # => gbk
+   #     end
+   #   end
+   def encoding
+     @encoding ||= (self.class.options['encoding'] || "utf-8")
+   end
+
+   def encoding=(encoding)
+     # if the content has already been fetched, it has to be reset
+     if @doc || @content
+       @doc = @content = nil
+     end
+     @encoding = encoding
+   end
+
+   # Returns all images on this page
+   #   page.images #=> ['http://www.google.com/logo.gif']
+   def images
+     imgs = []
+     elem = ndoc
+     elem = ndoc.search(options[:scope]) if options[:scope]
+     elem.search('img[@src]').each do |img|
+       src = img.attributes['src'].value
+       imgs << src
+     end
+     imgs
+   end
+
+   def url=(new_url)
+     @doc = @content = nil
+     @attributes = {}
+     @attributes_loaded = false
+     @url = new_url
+   end
+
+   def fix_image_urls
+     doc.search('img[@src]').each do |img|
+       img.set_attribute 'src', fix_urls(img.attributes['src'])
+     end
+   end
+
+   # clone a new page
+   def clone
+     self.class.new url,site
+   end
+
+   # Returns all links on this page.
+   # If :scope is given (e.g. :scope=>"#pagination"),
+   # only the matching subtree of the document is searched.
+   def links(options={},&block)
+     [].tap do |urls|
+       elem = ndoc
+       elem = ndoc.search(options[:scope].strip) if options[:scope] # strip matters: a trailing space causes an error
+       elem.search('a[@href]').each do |a|
+         url = a.attributes['href'].value.strip
+         urls << url unless url.empty? || url =~ /^\s*javascript:/
+       end
+       fix_urls(urls)
+     end
+   end
+
+   # Fetch a value by calling +method+ and run it through the given :filter list
+   def label(method,options={})
+     object = send(method)
+     object = case object
+     when String
+       object
+     when Symbol
+       object
+     when Hpricot::Elements
+       object.inner_html
+     when Nokogiri::XML::Element
+       object.to_html
+     else
+       object
+     end
+     [options[:filter]].flatten.uniq.compact.each do |filter|
+       object = case filter
+       when :javascript
+         elem = Nokogiri::HTML.fragment(object)
+         elem.search("script").remove
+         elem.to_html
+       when :css
+         elem = Nokogiri::HTML.fragment(object)
+         elem.search("style").remove
+         elem.to_html
+       when :tags
+         object.gsub(/<.+?>/,"")
+       else
+         object
+       end
+     end
+     object
+   end
+
+   # Apply the supported filters to html
+   def filter(html,filters=[])
+     filters = [filters].flatten
+     filters.each do |f|
+       html = case f
+       when :javascript
+         elem = Nokogiri::HTML.fragment(html)
+         elem.search("script").remove
+         elem.to_html
+       when :css
+         elem = Nokogiri::HTML.fragment(html)
+         elem.search("style").remove
+         elem.to_html
+       when :tags
+         html.gsub(/<.+?>/,"")
+       else
+         html
+       end
+     end
+     html
+   end
+
+   # filters will all have a chance to process this page
+   # DEPRECATED
+   # no longer used for now
+   def start
+     return
+     begin
+       # all done pages will not be processed
+       raise Spider::PageExistsAndDoneException if Spider::SpiderPage.find_by_url_and_done(url,true)
+     rescue Exception=>e
+       logger.info "Exception(#{url}): #{e.message}"
+       logger.debug e.backtrace.join("\n")
+     end
+   end
+
+   # The page content.
+   # The page does not fetch the url until this method is called:
+   #   page.url     # sends no HTTP request
+   #   page.content # only now is the HTTP request sent and the content fetched
+   def content
+     @content ||= request
+   end
+
+   # Set the page content
+   def content=(content)
+     content = content.to_s
+     @content_length = content.size
+     @content = content
+   end
+
+   # The page title
+   def title
+     begin
+       ndoc.search("title").inner_html
+     rescue
+       ''
+     end
+   end
+
+   # Returns an Hpricot document object
+   def doc
+     begin
+       @doc ||= Hpricot(content)
+     rescue
+       @doc ||= Hpricot(content,:xml=>true)
+     end
+   end
+
+   # Nokogiri document
+   def ndoc
+     @ndoc ||= Nokogiri::HTML.fragment(content)
+   end
+
+   # Search using Nokogiri
+   def nsearch(*args)
+     ndoc.search(*args)
+   end
+
+   # Shortcut for doc.search
+   def search(*args)
+     doc.search(*args)
+   end
+
+   # Does this page already exist in the system?
+   def exists?
+     !spider_page.nil?
+   end
+
+   def spider_page
+     Spider::SpiderPage.find_by_url(url)
+   end
+
+   # Builds a Spider::Page instance for every given URL;
+   # if a block is given, each page is yielded to it.
+   # This is one of the most important methods.
+   #   page.url = "http://www.google.com/home/index.html"
+   #   page.pages 'office.html','http://www.google.com/sport.html','/other.html' do |page|
+   #     # the three URLs passed above are resolved to
+   #     # http://www.google.com/home/office.html, http://www.google.com/sport.html, http://www.google.com/other.html
+   #     # and a page instance is yielded for each of them in turn
+   #   end
+   # Without arguments:
+   #   page.pages do |page|
+   #     # iterates over every link found on http://www.google.com/home/index.html
+   #   end
+   # Options:
+   #
+   # *filter*
+   #   page.pages :filter=>/google/ do |page|
+   #     # only urls matching the :filter regexp get through
+   #   end
+   #
+   # *append*
+   #   page.pages :append=>"http://www.google.com" do |page|
+   #     # appends the url(s) given via :append to the collected URL list
+   #     # this option may also be an array
+   #   end
+   #
+   # *skip_exists*
+   #   A boolean, true by default.
+   #   page.pages :skip_exists=>false do |page|
+   #     # urls that were already processed are no longer excluded
+   #   end
+   def pages(*args)
+     options = args.extract_options!
+     options.reverse_merge! :uniq=>true,:class=>self.class
+     links = args.empty? ? self.links(:scope=>options[:scope]) : fix_urls(args.flatten)
+     logger.info "links before filter: #{links.inspect}" if debug?
+     filter = options[:filter]
+     # filter
+     case filter
+     when nil
+     when Regexp
+       links = links.find_all{|l| l =~ filter }
+     when String
+     when Array
+       #links = links.find_all{|l| filters.find{|f| l =~ f } }
+     end
+
+     # except
+     except = options[:except]
+     case except
+     when nil
+     when Regexp
+       links.reject!{|link| link =~ except }
+     when String
+     when Array
+     end
+
+     # append
+     #logger.info "links after filter: #{links.inspect}" if debug?
+
+     links += [options[:append]].flatten if options[:append]
+
+     #logger.info "links after append: #{links.inspect}" if debug?
+
+     links.uniq! if options[:uniq]
+
+     links.collect{|i| go(i,self.options.clone.merge(:parent=>self,:class=>options[:class])) }.tap do |pages|
+       if block_given?
+         pages.each do |page|
+           logger.debug "yield page: #{page.inspect}" if debug?
+           yield page
+         end
+       end
+     end
+   end
+
+   # Save to the database (the SpiderPage class).
+   # Options:
+   # *save_labels*
+   #   Whether to also save the collected labels to the database (the SpiderPageLabel class); true by default.
+   #   Once saved, the URL will be excluded on the next call to pages.
+   def save(options={})
+     logger.info "saving page #{url}"
+     if exists?
+       page = spider_page
+     else
+       page = Spider::SpiderPage.new(options.merge(:url=>url,:site=>site.try(:id)))
+     end
+     page.content_length = content_length
+     #page.labels_hash = Digest::MD5.hexdigest(labels.to_yaml)
+     run_callbacks :save do
+       page.save unless debug?
+
+       # aotianlong:
+       # saving labels doesn't seem very useful, so it was disabled
+       #
+       # save labels
+       # attributes.each_pair do |name,value|
+       #   label = page.labels.find_or_initialize_by_name name.to_s
+       #   label.value = value.to_s
+       #   label.save unless debug?
+       # end
+       page
+     end
+   end
+
+   # The name of the ("channel") this page belongs to:
+   #   site.pages 'http://www.baidu.com',:name=>"baidu" do |page|
+   #     page.name # => "baidu"
+   #     page.pages [] do |page|
+   #       page.parent.name #=> "baidu"
+   #     end
+   #   end
+   def name
+     @options[:name]
+   end
+
+   def full_name
+     "#{site.name}.#{name}"
+   end
+
+   # Simulates clicking through from this page to another page.
+   # The :class option may be :list_page, :ListPage, or Module::ListPage;
+   # all of these are recognized.
+   def go(url,options={})
+     url = fix_urls(url)
+     klass = options[:class] || self.class
+     klass = case klass
+     when Symbol,String
+       klass_name = klass.to_s.classify
+       unless klass_name =~ /::/
+         klass_name = "#{self.class.parent.name}::#{klass_name}"
+       end
+       klass_name.constantize
+     else
+       klass
+     end
+     klass.new url,{:parent=>self}.merge(options)
+   end
+
+   # History navigation
+   def back(step=1)
+   end
+
+   def forward(step=1)
+   end
+
+   # Download a file from the internet, for example:
+   #   Spider::Page.download "http://powerapple.com/logo.gif",:to=>"/tmp/logo.gif"
+   def self.download(url,options={})
+     options.reverse_merge! :to=>File.basename(url),:method=>:get,:params=>{}
+     options[:method] = :get unless [:get,:post,:delete,:head].include?(options[:method])
+     to = options.delete :to
+     response = Spider::Http.send(options[:method],url,options)
+     dir = File.dirname(to)
+     FileUtils.mkdir_p dir unless File.directory?(dir)
+     File.open(to,"w+"){|f| f.write response }
+     to
+   end
+
+   # Copy a file, for example:
+   #   Spider::Page.cp "/tmp/test.gif","/tmp/test/test/test/test/test.gif"
+   # If the destination directory does not exist, it is created automatically.
+   def self.cp(src,dest,options={})
+     if File.exists?(src)
+       dir = File.dirname dest
+       FileUtils.mkdir_p dir unless File.directory?(dir)
+       logger.debug "file cp : #{src} -> #{dest}"
+       FileUtils.cp src,dest
+     end
+   end
+
+   def write_attribute(name,value)
+     @attributes[name] = value
+   end
+
+   def read_attribute(name,reload = false)
+     begin
+       send(name) if respond_to?(name)
+     rescue Exception=>e
+     end
+   end
+
+   # Returns a URI instance for the current url
+   def uri
+     @uri ||= URI.parse(url)
+   end
+
+   def reload
+     self.url = url
+     request
+   end
+
+   private
+
+   # Every url like "/event/xdfasdf.url" is rewritten to
+   # http://host.com/event/xdfasdf.url
+   # url can be an array or a string
+   def fix_urls(url)
+     port = uri.port == 80 ? "" : ":#{uri.port}"
+     case url
+     when /^\//
+       # absolute path without host information
+       # url.replace "#{uri.scheme}://#{uri.host}#{port}/#{url}"
+       url = URI.escape url
+       url.replace uri.merge(url).to_s
+     when /^http:\/\//i
+       # a complete URL
+       url
+     when Array
+       url.collect!{|u| fix_urls u }
+     else
+       # relative path without host information
+       url = URI.escape url
+       path = uri.merge(url).to_s
+       url.replace path
+     end
+     url
+   end
+
+   # This method fetches the content from the url.
+   # If you want to change the user agent or other HTTP headers, edit this method.
+   def fetch_content_from_url(options={})
+     method = options.delete :method
+     method = :get unless [:get,:head,:delete,:post,:put].include? method
+     body = ''
+     begin
+       logger.info "fetch content from url: #{url},method: #{method},options: #{options.inspect}"
+       logger.debug "httparty options: #{Spider::Http.default_options.inspect}"
+       logger.debug "cookies: #{Spider::Http.cookies.inspect}"
+       response = Spider::Http.send(method,url,options)
+       @response = response
+       if RUBY_VERSION >= "1.9" # ruby 1.9 encoding problem
+         body = response.body.force_encoding("utf-8")
+       else
+         body = response.body
+       end
+     # rescue Interrupt=>e
+     #   exit
+     rescue Exception=>e
+       logger.error e.message
+       logger.error e.backtrace.join("\n")
+     end
+     @content = body.tap do |b|
+       if !encoding.blank? && !(encoding =~ /^utf(\-?)8$/i) && @response.content_type =~ /text/i
+         logger.info "iconv #{encoding} -> utf-8"
+         b.replace Iconv.iconv("utf-8//IGNORE","#{encoding}//IGNORE",b).join.to_s
+       end
+       @content_length = b.size
+       begin
+         code = @response.code
+       rescue Exception=>e
+         code = 0
+       end
+       logger.info "status:#{code},#{b.size} bytes fetched."
+     end
+     @content
+   end
+
+   def fetch_content_from_url_with_cache(options={})
+     key = Digest::MD5.hexdigest(options.to_json.to_s) + "/" + Digest::MD5.hexdigest(url)
+     @content ||= Rails.cache.fetch key do
+       fetch_content_from_url_without_cache(options)
+     end
+     @content_length = @content.length
+     @content
+   end
+
+   # alias_method_chain :fetch_content_from_url,:cache
+
+ end
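
For orientation, here is a minimal usage sketch assembled from the APIs this diff introduces (define_attributes, the class-level encoding setter, pages, nsearch, and crawl). The MySite module, the URL, the regexp, and the CSS selector are hypothetical placeholders, not part of the gem:

  # Hypothetical example; MySite, the URL and the selectors are invented.
  module MySite
    class ShowPage < Spider::Page
      encoding "gbk"                   # declare this site's page encoding
      define_attributes :title, :body  # values exposed via #attributes

      def body
        nsearch("#content").inner_html # extract the article body (hypothetical selector)
      end
    end

    class ListPage < Spider::Page
      encoding "gbk"
    end
  end

  list = MySite::ListPage.new "http://example.com/articles"
  # Follow only article-looking links and visit each as a ShowPage;
  # crawl runs the callbacks and saves a Spider::SpiderPage record.
  list.pages :filter => /\/articles\/\d+/, :class => :show_page do |page|
    page.crawl
  end

Note that unless ignore_existing is set, crawl skips any URL that already has a Spider::SpiderPage row, which is what makes repeated runs incremental.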