rspider 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog +32 -0
- data/Rakefile +66 -0
- data/ToDo +19 -0
- data/bin/linkcheck.rb +37 -0
- data/bin/main.rb +41 -0
- data/conf/local.conf +23 -0
- data/lib/rspider.rb +34 -0
- data/lib/rspider/ConfParser.rb +149 -0
- data/lib/rspider/ContentStorage.rb +130 -0
- data/lib/rspider/DataWasher.rb +129 -0
- data/lib/rspider/Document.rb +100 -0
- data/lib/rspider/DocumentExtractor.rb +21 -0
- data/lib/rspider/HtmlTidy.rb +34 -0
- data/lib/rspider/Logger.rb +49 -0
- data/lib/rspider/MysqlUrlRelationStorage.rb +31 -0
- data/lib/rspider/MysqlUrlStorage.rb +107 -0
- data/lib/rspider/OptParser.rb +53 -0
- data/lib/rspider/RobotRules.rb +92 -0
- data/lib/rspider/SiteLocker.rb +45 -0
- data/lib/rspider/Spider.rb +324 -0
- data/lib/rspider/ThreadPool.rb +69 -0
- data/lib/rspider/UrlDispatcher.rb +59 -0
- data/lib/rspider/UrlScorer.rb +44 -0
- data/lib/rspider/UrlStorage.rb +44 -0
- data/lib/rspider/browser.rb +127 -0
- data/lib/rspider/cookie.rb +113 -0
- data/lib/rspider/links.rb +111 -0
- data/lib/rspider/mysql.rb +1131 -0
- data/sql/db.sql +90 -0
- metadata +73 -0
data/sql/db.sql
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
-- MySQL dump 10.11
|
2
|
+
--
|
3
|
+
-- Host: localhost Database: sphider2
|
4
|
+
-- ------------------------------------------------------
|
5
|
+
-- Server version 5.0.51a-log
|
6
|
+
|
7
|
+
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
|
8
|
+
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
|
9
|
+
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
|
10
|
+
/*!40101 SET NAMES utf8 */;
|
11
|
+
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
|
12
|
+
/*!40103 SET TIME_ZONE='+00:00' */;
|
13
|
+
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
|
14
|
+
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
|
15
|
+
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
|
16
|
+
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
|
17
|
+
|
18
|
+
--
|
19
|
+
-- Table structure for table `htmls`
|
20
|
+
--
|
21
|
+
|
22
|
+
-- Recreate the `htmls` table from scratch: any existing `htmls` table (and its rows) is dropped first.
-- The session's character_set_client is saved, switched to utf8 for the CREATE, and restored afterwards.
DROP TABLE IF EXISTS `htmls`;
|
23
|
+
SET @saved_cs_client = @@character_set_client;
|
24
|
+
SET character_set_client = utf8;
|
25
|
+
-- `htmls`: one row per `ukey` (primary key), holding `source`, `url`, the `html` text and two *_crc32 columns.
-- NOTE(review): `url_crc32`/`html_crc32` presumably hold CRC32 checksums of `url`/`html`, and `created`
-- presumably a Unix timestamp -- confirm against the code that writes this table.
-- Only `created` and `source` get secondary indexes; `url_crc32` is not indexed in this table.
CREATE TABLE `htmls` (
|
26
|
+
`source` varchar(32) NOT NULL default '',
|
27
|
+
`url` varchar(255) NOT NULL default '',
|
28
|
+
`url_crc32` bigint(16) NOT NULL default '0',
|
29
|
+
`html` text,
|
30
|
+
`html_crc32` bigint(16) NOT NULL default '0',
|
31
|
+
`created` bigint(11) NOT NULL default '0',
|
32
|
+
`ukey` varchar(128) NOT NULL default '-',
|
33
|
+
PRIMARY KEY (`ukey`),
|
34
|
+
KEY `created` (`created`),
|
35
|
+
KEY `source` (`source`)
|
36
|
+
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
|
37
|
+
SET character_set_client = @saved_cs_client;
|
38
|
+
|
39
|
+
--
|
40
|
+
-- Table structure for table `url_relations`
|
41
|
+
--
|
42
|
+
|
43
|
+
-- Recreate the `url_relations` table from scratch: any existing table (and its rows) is dropped first.
-- The session's character_set_client is saved, switched to utf8 for the CREATE, and restored afterwards.
DROP TABLE IF EXISTS `url_relations`;
|
44
|
+
SET @saved_cs_client = @@character_set_client;
|
45
|
+
SET character_set_client = utf8;
|
46
|
+
-- `url_relations`: one row per (`referer`, `url`) pair, keyed by an auto-increment `id`.
-- Both *_crc32 columns are indexed; the `referer` and `url` strings themselves are not.
-- NOTE(review): the *_crc32 columns presumably hold CRC32 checksums of the matching string columns and
-- exist so lookups can use the integer indexes -- confirm against the code that queries this table.
-- The AUTO_INCREMENT=11089 table option was carried over from the dumped database, so on a fresh
-- install the first generated `id` is 11089 rather than 1.
CREATE TABLE `url_relations` (
|
47
|
+
`id` bigint(11) NOT NULL auto_increment,
|
48
|
+
`referer` varchar(255) NOT NULL default '-',
|
49
|
+
`url` varchar(255) NOT NULL default '-',
|
50
|
+
`referer_crc32` bigint(11) NOT NULL,
|
51
|
+
`url_crc32` bigint(11) NOT NULL,
|
52
|
+
PRIMARY KEY (`id`),
|
53
|
+
KEY `idx_referer_crc32` (`referer_crc32`),
|
54
|
+
KEY `idx_url_crc32` (`url_crc32`)
|
55
|
+
) ENGINE=MyISAM AUTO_INCREMENT=11089 DEFAULT CHARSET=utf8;
|
56
|
+
SET character_set_client = @saved_cs_client;
|
57
|
+
|
58
|
+
--
|
59
|
+
-- Table structure for table `urls`
|
60
|
+
--
|
61
|
+
|
62
|
+
-- Recreate the `urls` table from scratch: any existing table (and its rows) is dropped first.
-- The session's character_set_client is saved, switched to utf8 for the CREATE, and restored afterwards.
DROP TABLE IF EXISTS `urls`;
|
63
|
+
SET @saved_cs_client = @@character_set_client;
|
64
|
+
SET character_set_client = utf8;
|
65
|
+
-- `urls`: one row per `ukey` (primary key) with per-URL bookkeeping columns
-- (`added`, `visited`, `score`, `errors`) and secondary indexes on `visited`, `added` and `source`.
-- `added`, `visited` and `score` default to NULL; `errors` defaults to 0.
-- NOTE(review): `added`/`visited` are presumably Unix timestamps and `url_crc32` a CRC32 of `url` --
-- confirm against the code that writes this table.
-- NOTE(review): `url` is varchar(256) here but varchar(255) in `htmls` and `url_relations`; confirm
-- whether the one-character difference is intentional.
CREATE TABLE `urls` (
|
66
|
+
`source` varchar(32) NOT NULL default '',
|
67
|
+
`url` varchar(256) NOT NULL default '',
|
68
|
+
`added` bigint(11) default NULL,
|
69
|
+
`visited` bigint(11) default NULL,
|
70
|
+
`ukey` varchar(128) NOT NULL default '-',
|
71
|
+
`score` int(4) default NULL,
|
72
|
+
`errors` int(4) default '0',
|
73
|
+
`url_crc32` bigint(11) NOT NULL default '0',
|
74
|
+
PRIMARY KEY (`ukey`),
|
75
|
+
KEY `visited` (`visited`),
|
76
|
+
KEY `added` (`added`),
|
77
|
+
KEY `source` (`source`)
|
78
|
+
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
|
79
|
+
SET character_set_client = @saved_cs_client;
|
80
|
+
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
81
|
+
|
82
|
+
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
83
|
+
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
|
84
|
+
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
|
85
|
+
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
|
86
|
+
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
|
87
|
+
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
88
|
+
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
89
|
+
|
90
|
+
-- Dump completed on 2008-09-08 16:22:18
|
metadata
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: rspider
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.8.4
|
7
|
+
date: 2008-09-15 00:00:00 +08:00
|
8
|
+
summary: Web crawler
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: xurenlu@gmail.com
|
12
|
+
homepage: http://www.162cm.com/
|
13
|
+
rubyforge_project: rspider
|
14
|
+
description:
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Renlu Xu
|
31
|
+
files:
|
32
|
+
- lib/rspider/UrlScorer.rb
|
33
|
+
- lib/rspider/UrlStorage.rb
|
34
|
+
- lib/rspider/mysql.rb
|
35
|
+
- lib/rspider/MysqlUrlRelationStorage.rb
|
36
|
+
- lib/rspider/Logger.rb
|
37
|
+
- lib/rspider/cookie.rb
|
38
|
+
- lib/rspider/RobotRules.rb
|
39
|
+
- lib/rspider/SiteLocker.rb
|
40
|
+
- lib/rspider/UrlDispatcher.rb
|
41
|
+
- lib/rspider/ThreadPool.rb
|
42
|
+
- lib/rspider/OptParser.rb
|
43
|
+
- lib/rspider/DataWasher.rb
|
44
|
+
- lib/rspider/HtmlTidy.rb
|
45
|
+
- lib/rspider/ContentStorage.rb
|
46
|
+
- lib/rspider/Spider.rb
|
47
|
+
- lib/rspider/DocumentExtractor.rb
|
48
|
+
- lib/rspider/browser.rb
|
49
|
+
- lib/rspider/links.rb
|
50
|
+
- lib/rspider/ConfParser.rb
|
51
|
+
- lib/rspider/MysqlUrlStorage.rb
|
52
|
+
- lib/rspider/Document.rb
|
53
|
+
- lib/rspider.rb
|
54
|
+
- sql/db.sql
|
55
|
+
- Changelog
|
56
|
+
- ToDo
|
57
|
+
- conf/local.conf
|
58
|
+
- Rakefile
|
59
|
+
test_files: []
|
60
|
+
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
extra_rdoc_files: []
|
64
|
+
|
65
|
+
executables:
|
66
|
+
- linkcheck.rb
|
67
|
+
- main.rb
|
68
|
+
extensions: []
|
69
|
+
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
dependencies: []
|
73
|
+
|