news_crawler 0.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/news_crawler +94 -0
- data/lib/news_crawler/autostart.rb +33 -0
- data/lib/news_crawler/config.rb +53 -0
- data/lib/news_crawler/crawler_module.rb +70 -0
- data/lib/news_crawler/default_config.yml +13 -0
- data/lib/news_crawler/default_sds.yml +1 -0
- data/lib/news_crawler/downloader.rb +112 -0
- data/lib/news_crawler/link_selector/same_domain_selector.rb +172 -0
- data/lib/news_crawler/nc_logger.rb +49 -0
- data/lib/news_crawler/storage/raw_data/mongo_storage.rb +77 -0
- data/lib/news_crawler/storage/raw_data/raw_data_engine.rb +67 -0
- data/lib/news_crawler/storage/raw_data.rb +74 -0
- data/lib/news_crawler/storage/url_queue/mongo_storage.rb +218 -0
- data/lib/news_crawler/storage/url_queue/url_queue_engine.rb +124 -0
- data/lib/news_crawler/storage/url_queue/url_queue_error.rb +28 -0
- data/lib/news_crawler/storage/url_queue.rb +150 -0
- data/lib/news_crawler/url_helper.rb +50 -0
- data/lib/news_crawler/utils/robots_patch.rb +34 -0
- data/lib/news_crawler.rb +47 -0
- metadata +203 -0
data/lib/news_crawler/url_helper.rb
ADDED
@@ -0,0 +1,50 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+module NewsCrawler
+  # Contains various method for processing url
+  module URLHelper
+    # produce true if 2 urls belong to same domain
+    # @param [ String ] url1 Url 1
+    # @param [ String ] url2 Url 2
+    # @return [ Boolean ] true if both url belong to same domain
+    def same_domain?(url1, url2)
+      p1 = get_url_path(url1)
+      p2 = get_url_path(url2)
+      d1 = p1[:domain].split('.').reverse
+      d2 = p2[:domain].split('.').reverse
+      d1.zip(d2).inject(true) do | mem, obj |
+        mem = mem && ((obj[0] == obj[1]) || (obj[0].nil? || obj[1].nil?))
+      end
+    end
+
+    # split URL into 3 parts: scheme, domain, path
+    # @param [ String ] url
+    # return [ Hash ] contains parts
+    def get_url_path(url)
+      pattern = /((?<scheme>(http|https)):\/\/)?(?<domain>[^\/]+)?(?<path>\/.*)?/
+      md = pattern.match(url)
+      { :scheme => md[:scheme],
+        :domain => md[:domain],
+        :path => md[:path]}
+    end
+  end
+end
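URLHelper is a mixin, so the quickest way to see what these two methods return is to include it in a short script. The snippet below is an illustrative sketch only (the example URLs are made up), not part of the released gem:

require 'news_crawler/url_helper'

include NewsCrawler::URLHelper

# get_url_path splits a URL into its scheme, domain and path parts
p get_url_path('http://example.com/news/1')
# => {:scheme=>"http", :domain=>"example.com", :path=>"/news/1"}

# same_domain? compares domain labels from the right (TLD first); a missing
# subdomain on one side still counts as a match because of the nil? check
p same_domain?('http://example.com/a', 'http://www.example.com/b')   # => true
p same_domain?('http://example.com', 'http://example.org')           # => false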
data/lib/news_crawler/utils/robots_patch.rb
ADDED
@@ -0,0 +1,34 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'robots'
+require 'uri'
+require 'singleton'
+
+class Robots
+  include Singleton
+  attr_accessor :user_agent
+
+  def initialize
+    @user_agent = 'NewsCrawler'
+    @parsed = {}
+  end
+end
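This patch reopens the Robots class from the robots gem and turns it into a Singleton, so every crawler module shares one instance (and one robots.txt cache in @parsed) under the 'NewsCrawler' user agent. A minimal usage sketch, assuming the robots gem's allowed? method and a made-up URL:

require 'news_crawler/utils/robots_patch'

robots = Robots.instance            # Singleton: Robots.new is no longer called directly
robots.user_agent = 'NewsCrawler'   # already the default set by the patched initialize

# allowed? is provided by the robots gem itself
if robots.allowed?('http://example.com/some/page')
  # the page may be fetched
end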
data/lib/news_crawler.rb
ADDED
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+# TODO implement easy API
+
+require 'news_crawler/autostart'
+require 'news_crawler/config'
+require 'news_crawler/downloader'
+require 'news_crawler/link_selector/same_domain_selector'
+
+NewsCrawler::Storage::RawData.set_engine(:mongo)
+NewsCrawler::Storage::URLQueue.set_engine(:mongo)
+
+include NewsCrawler::Storage
+
+URLQueue.clear
+
+# RawData.clear
+# dwl = NewsCrawler::Downloader.new
+# dwl.run
+# #dwl.async.run
+# #dwl.graceful_terminate
+
+URLQueue.mark_all('NewsCrawler::LinkSelector::SameDomainSelector', "unprocessed")
+
+puts "Raw entries: #{RawData.count}"
+NewsCrawler::LinkSelector::SameDomainSelector.new
+puts "URL entries: #{URLQueue.count}"
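As released, lib/news_crawler.rb is less a library entry point than a driver script: it selects the MongoDB engines, resets the URL queue, and runs the same-domain link selector, with the downloader step left commented out. The sketch below re-enables that commented-out block using only calls visible in this file; how the queue is initially seeded is not shown in this version, and a reachable MongoDB is assumed:

require 'news_crawler/autostart'
require 'news_crawler/config'
require 'news_crawler/downloader'
require 'news_crawler/link_selector/same_domain_selector'

NewsCrawler::Storage::RawData.set_engine(:mongo)
NewsCrawler::Storage::URLQueue.set_engine(:mongo)

include NewsCrawler::Storage

URLQueue.clear
RawData.clear

dwl = NewsCrawler::Downloader.new   # the block commented out above
dwl.run                             # or dwl.async.run, since Celluloid is a dependency

URLQueue.mark_all('NewsCrawler::LinkSelector::SameDomainSelector', 'unprocessed')
NewsCrawler::LinkSelector::SameDomainSelector.new

puts "Raw entries: #{RawData.count}"
puts "URL entries: #{URLQueue.count}"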
metadata
ADDED
@@ -0,0 +1,203 @@
+--- !ruby/object:Gem::Specification
+name: news_crawler
+version: !ruby/object:Gem::Version
+  version: 0.0.0.pre.1
+platform: ruby
+authors:
+- Hà Quang Dương
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-07-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mongo
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '1.9'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '1.9'
+- !ruby/object:Gem::Dependency
+  name: typhoeus
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.6'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.5'
+- !ruby/object:Gem::Dependency
+  name: celluloid
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.14'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.14'
+- !ruby/object:Gem::Dependency
+  name: simpleconfig
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: robots
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.10'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.10'
+- !ruby/object:Gem::Dependency
+  name: simplecov
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: mocha
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.14'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.14'
+- !ruby/object:Gem::Dependency
+  name: coveralls
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: A flexible, modular web crawler
+email: contact@haqduong.net
+executables:
+- news_crawler
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/news_crawler.rb
+- lib/news_crawler/autostart.rb
+- lib/news_crawler/config.rb
+- lib/news_crawler/crawler_module.rb
+- lib/news_crawler/downloader.rb
+- lib/news_crawler/link_selector/same_domain_selector.rb
+- lib/news_crawler/nc_logger.rb
+- lib/news_crawler/storage/raw_data.rb
+- lib/news_crawler/storage/raw_data/mongo_storage.rb
+- lib/news_crawler/storage/raw_data/raw_data_engine.rb
+- lib/news_crawler/storage/url_queue.rb
+- lib/news_crawler/storage/url_queue/mongo_storage.rb
+- lib/news_crawler/storage/url_queue/url_queue_engine.rb
+- lib/news_crawler/storage/url_queue/url_queue_error.rb
+- lib/news_crawler/url_helper.rb
+- lib/news_crawler/utils/robots_patch.rb
+- lib/news_crawler/default_config.yml
+- lib/news_crawler/default_sds.yml
+- bin/news_crawler
+homepage: http://haqduong.github.io/news_crawler/
+licenses:
+- GPLv3
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: 2.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>'
+    - !ruby/object:Gem::Version
+      version: 1.3.1
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: News crawler
+test_files: []
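Because the only published version is a prerelease, a plain gem install or an unpinned Gemfile entry will not pick it up. For reference, pinning the prerelease explicitly looks like the sketch below (Ruby >= 2.0.0 is required by the gemspec; a reachable MongoDB is assumed by the default storage engines):

# Gemfile
gem 'news_crawler', '0.0.0.pre.1'

# or, from the command line, allow prerelease versions:
#   gem install news_crawler --pre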