links_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: c07bea8f84cb2543c550fb14f3d8482d7a960918
+   data.tar.gz: 11e05b8107486dfa22b004c97b867bd938ffffa7
+ SHA512:
+   metadata.gz: d0bceaa69b62924ff60f1cb180d6b56b4c1df58b74fee4f635302471cc4fb6b6d35e90cbde94ad9c1af80e101c9bfe3cadd823aa8b97ac9cb83e240f6fbb4c39
+   data.tar.gz: 7a012812983be0597d5787547fc75ed4a6b63e63955dd13de5b51f92ca8c9b904224766e0751132e193fe038669f6a2dd68192aaeab739136130dfd1db5a94e7
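These digests let a consumer verify the two archives inside the .gem package. A minimal verification sketch using Ruby's standard Digest library, assuming the package has already been unpacked so metadata.gz sits in the working directory (the path is illustrative, not part of the gem):

require 'digest'

# Compare the recorded SHA512 against the actual digest of metadata.gz.
# Unpack links_crawler-0.0.1.gem first: a .gem file is a tar archive
# containing metadata.gz, data.tar.gz, and checksums.yaml.gz.
expected = "d0bceaa69b62924ff60f1cb180d6b56b4c1df58b74fee4f635302471cc4fb6b6d35e90cbde94ad9c1af80e101c9bfe3cadd823aa8b97ac9cb83e240f6fbb4c39"
actual   = Digest::SHA512.file("metadata.gz").hexdigest
puts(actual == expected ? "metadata.gz verified" : "checksum mismatch")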
lib/links_crawler.rb ADDED
@@ -0,0 +1,24 @@
+ require 'net/http'
+
+ module LinksCrawler # products list, feedbacks list
+   def self.included(recipient)
+     recipient.class_eval do
+       include ModelInstanceMethods
+     end
+   end
+
+   module ModelInstanceMethods
+     def import(target, importer)
+       case importer.origin
+       when "eBay"
+         extend Ebay::Importer
+         target_at_and_for target, importer
+       when "iOffer"
+         extend Ioffer::Importer
+         target_at_and_for target, importer
+       end
+     end
+   end # instance methods
+ end
+
+
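lib/links_crawler.rb uses the `included` hook so that any class pulling in the module gains `ModelInstanceMethods`, and `import` then extends the instance with a marketplace-specific importer at runtime. Note that `Ebay::Importer`, `Ioffer::Importer`, the `target_at_and_for` method, and the shape of the `importer` argument are not defined anywhere in this gem, so the stubs below are pure assumptions, sketched only to show how a host application might wire this up:

require 'links_crawler'

# Hypothetical stand-in for the eBay importer the gem expects to exist
# elsewhere; links_crawler itself does not ship it.
module Ebay
  module Importer
    def target_at_and_for(target, importer)
      puts "importing #{target} from #{importer.origin}"
    end
  end
end

ImportJob = Struct.new(:origin) # assumed shape: anything responding to #origin

class Listing
  include LinksCrawler # the included hook mixes in ModelInstanceMethods
end

Listing.new.import("products list", ImportJob.new("eBay"))
# => importing products list from eBay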
lib/links_crawler/model_instance_methods.rb ADDED
@@ -0,0 +1,72 @@
+ require 'net/http'
+
+ module LinksCrawler
+   module ModelInstanceMethods
+
+     def fetch_links(dname, path)
+       # Prepend a scheme unless the domain name already carries one
+       uri = dname.include?("http") ? "#{dname}#{path}" : "http://#{dname}#{path}"
+       uri = URI(uri)
+
+       Net::HTTP.start(uri.host, uri.port) do |http|
+         request = Net::HTTP::Get.new uri
+         res = http.request request
+         str_body = res.body
+
+         # Relative links: the lookahead skips absolute URLs, parent paths and fragments
+         relative_links = str_body.scan(/href="(?!http|\.\.|#).*?"/)
+         relative_links_path = relative_links.collect do |relative_link|
+           relative_link.delete("\"")[5..-1] # drop the quotes and the leading href=
+         end
+
+         # Absolute links are kept only when they point at the same domain
+         abs_links = str_body.scan(/href="http.*?"/).select do |href|
+           href.include? dname
+         end
+         abs_links_path = abs_links.collect do |abs_href|
+           URI(abs_href.delete("\"")[5..-1]).path
+         end
+
+         mixed_links = (relative_links_path + abs_links_path).uniq
+         mixed_links.each do |link|
+           if path == '/'
+             @arr_links << link
+           else
+             @arr_links.unshift link if is_valid_path?(link)
+           end
+         end
+       end
+     end
+
+     def is_valid_path?(link)
+       !@arr_links.include?(link) && !link.include?("javascript") && !link.match(/css$/)
+     end
+
+     def traverse(dname)
+       root = '/'
+       fetch_links(dname, root)
+       while link = @arr_links.shift
+         unless @traversed.include? link
+           @traversed << link
+           puts "dname #{dname} link #{link}"
+           begin
+             fetch_links(dname, link)
+           rescue StandardError
+             # ignore pages that fail to fetch or parse
+           ensure
+             @arr_links.delete(link) # drop duplicates of the link just traversed
+           end
+         end
+       end
+     end
+
+
+     def search(dname)
+       @arr_links = []
+       @traversed = []
+       traverse(dname)
+       @traversed.each do |path|
+         puts "http://#{dname}" + path
+       end
+     end
+
+   end
+ end
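In the crawler itself, `search` resets the two instance-variable work lists, `traverse` drains `@arr_links` while recording every visited path in `@traversed`, and `fetch_links` scrapes href attributes from each fetched page, queueing only same-domain paths that pass `is_valid_path?`. A usage sketch, assuming a host class of your own and a placeholder domain (the gem prints discovered URLs rather than returning them):

require 'links_crawler'

class SiteAudit
  include LinksCrawler # mixes in fetch_links / traverse / search
end

# Crawls every same-domain page reachable from "/" and prints each
# discovered URL; "example.com" is a placeholder, not a real target.
SiteAudit.new.search("example.com")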
lib/links_crawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module LinksCrawler
+   VERSION = "0.0.1"
+ end
links_crawler.gemspec ADDED
@@ -0,0 +1,20 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "links_crawler/version"
+
+ Gem::Specification.new do |s|
+   s.name        = "links_crawler"
+   s.version     = LinksCrawler::VERSION
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["wenbo"]
+   s.email       = ["yiyun6674@hotmail.com"]
+   s.homepage    = ""
+   s.summary     = %q{check how many links are available inside the website}
+   s.description = <<-EOF
+     check how many links are available inside the website
+   EOF
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- test/*`.split("\n")
+   s.require_paths = ["lib"]
+ end
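After building and installing the gem from this gemspec, a quick smoke test is to require it and read the version constant back; this sketch only assumes the gem is on the load path:

require 'links_crawler/version'
puts LinksCrawler::VERSION # => "0.0.1"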
metadata ADDED
@@ -0,0 +1,48 @@
+ --- !ruby/object:Gem::Specification
+ name: links_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - wenbo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-06-02 00:00:00.000000000 Z
+ dependencies: []
+ description: |2
+   check how many links are available inside the website
+ email:
+ - yiyun6674@hotmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/links_crawler.rb
+ - lib/links_crawler/model_instance_methods.rb
+ - lib/links_crawler/version.rb
+ - links_crawler.gemspec
+ homepage: ''
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: check how many links are available inside the website
+ test_files: []