links_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c07bea8f84cb2543c550fb14f3d8482d7a960918
4
+ data.tar.gz: 11e05b8107486dfa22b004c97b867bd938ffffa7
5
+ SHA512:
6
+ metadata.gz: d0bceaa69b62924ff60f1cb180d6b56b4c1df58b74fee4f635302471cc4fb6b6d35e90cbde94ad9c1af80e101c9bfe3cadd823aa8b97ac9cb83e240f6fbb4c39
7
+ data.tar.gz: 7a012812983be0597d5787547fc75ed4a6b63e63955dd13de5b51f92ca8c9b904224766e0751132e193fe038669f6a2dd68192aaeab739136130dfd1db5a94e7
@@ -0,0 +1,24 @@
1
+ require 'net/http'
2
+
3
# Entry-point mixin of the gem (original note: products list, feedbacks list).
#
# Including LinksCrawler into a class also mixes
# LinksCrawler::ModelInstanceMethods into that class.
module LinksCrawler
  # Hook Ruby runs when this module is included somewhere.
  #
  # @param recipient [Module] the class or module doing the include
  def self.included(recipient)
    # `send` keeps this working on Rubies where Module#include is private.
    recipient.send(:include, ModelInstanceMethods)
  end

  # Instance methods handed to every class that includes LinksCrawler.
  module ModelInstanceMethods
    # Picks the importer mixin that matches +importer.origin+, extends the
    # receiver with it and delegates to +target_at_and_for+.
    #
    # NOTE(review): Ebay::Importer, Ioffer::Importer and target_at_and_for are
    # not defined in the visible source; presumably the host application
    # supplies them -- confirm before relying on this method.
    #
    # @param target   passed through untouched to +target_at_and_for+
    # @param importer object responding to +origin+
    # @return whatever +target_at_and_for+ returns, or nil when the origin is
    #   neither "eBay" nor "iOffer"
    def import(target, importer)
      adapter =
        case importer.origin
        when "eBay"   then Ebay::Importer
        when "iOffer" then Ioffer::Importer
        end
      return unless adapter

      extend adapter
      target_at_and_for target, importer
    end
  end
end
23
+
24
+
@@ -0,0 +1,72 @@
1
+ require 'net/http'
2
+
3
module LinksCrawler
  # Crawling helpers mixed into a host object.
  #
  # State lives in two instance variables that #search initialises:
  # * @arr_links -- queue of paths still to be fetched
  # * @traversed -- paths already visited, in visiting order
  module ModelInstanceMethods
    # Double-quoted href that is neither an absolute http(s) URL, nor
    # parent-relative ("../x"), nor a pure fragment ("#x").
    # A negative lookahead is used on purpose: a character class such as
    # [^(http|\.\.|#)] rejects any link that merely *starts* with one of the
    # characters h, t, p, ".", "(", ")", "|" or "#" (e.g. "help.html").
    RELATIVE_HREF = %r{href="(?!https?://|\.\.|#)([^"]+)"}.freeze

    # Double-quoted href holding an absolute http(s) URL.
    ABSOLUTE_HREF = %r{href="(https?://[^"]*)"}.freeze

    # Fetches one page of the site and queues the in-site links found on it.
    #
    # Links found on the root page ('/') are appended to @arr_links as they
    # are; links found on any other page are pushed to the front of the queue,
    # and only when #is_valid_path? accepts them.
    #
    # @param dname [String] host name ("example.com") or site root including
    #   the scheme ("https://example.com")
    # @param path [String] path of the page to fetch, e.g. "/" or "/about"
    # @return [Array<String>] the unique in-site paths found on the page
    # @raise [StandardError] network and URI errors for the page itself are
    #   not rescued here
    def fetch_links(dname, path)
      page = URI("#{crawl_root_url(dname)}#{path}")
      found = crawl_page_paths(crawl_page_body(page), dname, page)

      found.each do |candidate|
        if path == '/'
          @arr_links << candidate
        elsif is_valid_path?(candidate)
          @arr_links.unshift candidate
        end
      end
    end

    # Tells whether a path is worth queueing.
    #
    # @param tmp [String] candidate path
    # @return [Boolean] false when the path is already queued, contains
    #   "javascript" or ends in "css"; true otherwise
    def is_valid_path?(tmp)
      return false if @arr_links.include?(tmp)
      return false if tmp.include?("javascript")

      tmp !~ /css$/
    end

    # Walks the site starting at '/', visiting every queued path once.
    #
    # A failure while fetching the root page propagates to the caller. A
    # failure on any later page is reported on stderr and the crawl continues.
    #
    # @param dname [String] see #fetch_links
    # @return [nil]
    def traverse(dname)
      fetch_links(dname, '/')

      while (link = @arr_links.shift)
        next if @traversed.include?(link)

        @traversed << link
        puts "dname #{dname} link #{link}"
        begin
          fetch_links(dname, link)
        rescue StandardError => e
          warn "links_crawler: skipped #{link} (#{e.class}: #{e.message})"
        ensure
          # The page may link to itself; drop the path so it is not re-queued.
          @arr_links.delete(link)
        end
      end
    end

    # Crawls the whole site and prints the URL of every visited page.
    #
    # @param dname [String] see #fetch_links
    # @return [Array<String>] the visited paths, in visiting order
    def search(dname)
      @arr_links = []
      @traversed = []
      traverse(dname)

      root = crawl_root_url(dname)
      @traversed.each do |visited|
        puts "#{root}#{visited}"
      end
    end

    private

    # Site root with a scheme and without a trailing slash; "http://" is
    # assumed when +dname+ does not start with http:// or https://.
    def crawl_root_url(dname)
      root = dname.chomp('/')
      root =~ %r{\Ahttps?://}i ? root : "http://#{root}"
    end

    # Performs the GET request for +uri+ (TLS when the scheme is https) and
    # returns the response body as a String.
    def crawl_page_body(uri)
      Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
        http.request(Net::HTTP::Get.new(uri)).body.to_s
      end
    end

    # Extracts the unique in-site paths from an HTML body. Relative hrefs are
    # resolved against +page+; absolute hrefs are kept when they mention the
    # site name and are reduced to their path component.
    def crawl_page_paths(body, dname, page)
      site = crawl_root_url(dname).sub(%r{\Ahttps?://}i, '')

      relative = body.scan(RELATIVE_HREF).flatten.map do |href|
        target = crawl_parse(page, href)
        # Skips mailto:, javascript:, protocol-relative links to other hosts...
        target.request_uri if target.is_a?(URI::HTTP) && target.host == page.host
      end

      absolute = body.scan(ABSOLUTE_HREF).flatten.select { |href| href.include?(site) }.map do |href|
        target = crawl_parse(page, href)
        target.path if target
      end

      (relative + absolute).compact.uniq
    end

    # Resolves +href+ against +page+; returns nil for hrefs URI cannot parse
    # so that one malformed link does not discard the rest of the page.
    def crawl_parse(page, href)
      URI.join(page.to_s, href)
    rescue URI::Error
      nil
    end
  end
end
@@ -0,0 +1,3 @@
1
module LinksCrawler
  # Release version of the gem. Frozen so the constant cannot be mutated in
  # place at runtime.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,20 @@
1
+ #-*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "links_crawler/version"
4
+
5
# Package definition for the links_crawler gem.
Gem::Specification.new do |s|
  s.name        = "links_crawler"
  s.version     = LinksCrawler::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["wenbo"]
  s.email       = ["yiyun6674@hotmail.com"]
  s.homepage    = ""
  s.summary     = %q{check how many links are available inside the website}
  s.description = <<-EOF
    check how many links are available inside the website
  EOF

  # Every file tracked by git is packaged.
  s.files         = `git ls-files`.split("\n")
  # Derived from the packaged file list instead of a second git call: the
  # former pathspec "{test}/*" is a one-element brace group, which the shell
  # does not expand, so it only ever matched a directory literally named
  # "{test}" and left test_files empty.
  s.test_files    = s.files.grep(%r{\Atest/})
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: links_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - wenbo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-06-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: |2
14
+ check how many links are available inside the website
15
+ email:
16
+ - yiyun6674@hotmail.com
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - lib/links_crawler.rb
22
+ - lib/links_crawler/model_instance_methods.rb
23
+ - lib/links_crawler/version.rb
24
+ - links_crawler.gemspec
25
+ homepage: ''
26
+ licenses: []
27
+ metadata: {}
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - '>='
35
+ - !ruby/object:Gem::Version
36
+ version: '0'
37
+ required_rubygems_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 2.2.2
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: check how many links are available inside the website
48
+ test_files: []