news_crawler 0.0.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,50 @@
1
+ #--
2
+ # NewsCrawler - a website crawler
3
+ #
4
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
5
+ #
6
+ # This file is part of NewsCrawler.
7
+ #
8
+ # NewsCrawler is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # NewsCrawler is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+
22
module NewsCrawler
  # Contains various methods for processing URLs.
  module URLHelper
    # Splits a URL into an optional http/https scheme, a host part and a path.
    # Anchored at the start of the string; the scheme is matched
    # case-insensitively.
    URL_PARTS = %r{\A(?:(?<scheme>https?)://)?(?<domain>[^/]+)?(?<path>/.*)?}i

    # Produce true if 2 urls belong to the same domain.
    #
    # Host labels are compared from the top-level domain downwards over the
    # labels both hosts have, so a host and any of its sub-domains count as
    # the same domain ("example.com" and "www.example.com"). The comparison
    # ignores letter case, userinfo ("user@") and port (":8080").
    #
    # @param [ String ] url1 Url 1
    # @param [ String ] url2 Url 2
    # @return [ Boolean ] true if both urls belong to the same domain; false
    #   when either url has no host part (e.g. a path-only url)
    def same_domain?(url1, url2)
      labels1 = domain_labels(url1)
      labels2 = domain_labels(url2)
      # Without a host on both sides there is nothing to compare.
      return false if labels1.empty? || labels2.empty?

      shared = [labels1.size, labels2.size].min
      labels1.first(shared) == labels2.first(shared)
    end

    # Split URL into 3 parts: scheme, domain, path.
    #
    # @param [ String ] url
    # @return [ Hash ] with keys :scheme, :domain and :path; a part that is
    #   absent from the url is nil. :domain is returned as written in the
    #   url (including any userinfo or port).
    def get_url_path(url)
      # to_s keeps a nil url from raising; every group is optional, so the
      # anchored pattern always matches a String.
      md = URL_PARTS.match(url.to_s)
      { :scheme => md[:scheme],
        :domain => md[:domain],
        :path   => md[:path] }
    end

    private

    # Host labels of +url+, lower-cased, most significant (TLD) first.
    # Returns an empty array when the url has no host part.
    def domain_labels(url)
      host = get_url_path(url)[:domain]
      return [] unless host

      host.sub(/\A[^@]*@/, '')  # drop userinfo
          .sub(/:\d*\z/, '')    # drop port
          .downcase
          .split('.')
          .reverse
    end
  end
end
@@ -0,0 +1,34 @@
1
+ #--
2
+ # NewsCrawler - a website crawler
3
+ #
4
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
5
+ #
6
+ # This file is part of NewsCrawler.
7
+ #
8
+ # NewsCrawler is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # NewsCrawler is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+
22
+ require 'robots'
23
+ require 'uri'
24
+ require 'singleton'
25
+
26
# Patch for the Robots class: mixes in Singleton and replaces the
# constructor with a no-argument one, so a single shared instance is
# obtained through Robots.instance.
class Robots
  include Singleton

  # User-agent string; readable and writable on the shared instance.
  attr_reader :user_agent
  attr_writer :user_agent

  # Sets the default user agent to 'NewsCrawler' and starts with an empty
  # @parsed hash (presumably the cache used by the patched library's own
  # methods -- confirm against the robots gem).
  def initialize
    @parsed = {}
    @user_agent = 'NewsCrawler'
  end
end
@@ -0,0 +1,47 @@
1
+ # -*- coding: utf-8 -*-
2
+ #--
3
+ # NewsCrawler - a website crawler
4
+ #
5
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
6
+ #
7
+ # This file is part of NewsCrawler.
8
+ #
9
+ # NewsCrawler is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NewsCrawler is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+ # TODO: implement an easy-to-use API
24
+
25
# Load the crawler components this script uses.
require 'news_crawler/autostart'
require 'news_crawler/config'
require 'news_crawler/downloader'
require 'news_crawler/link_selector/same_domain_selector'

# Select the :mongo engine for both storages.
NewsCrawler::Storage::RawData.set_engine(:mongo)
NewsCrawler::Storage::URLQueue.set_engine(:mongo)

# Make RawData and URLQueue reachable without the namespace prefix.
include NewsCrawler::Storage

# Start from a cleared URL queue.
URLQueue.clear

# Download step, currently disabled:
# RawData.clear
# dwl = NewsCrawler::Downloader.new
# dwl.run
# #dwl.async.run
# #dwl.graceful_terminate

# Flag every entry as "unprocessed" for the same-domain selector
# (presumably so the selector picks them all up again -- confirm).
URLQueue.mark_all('NewsCrawler::LinkSelector::SameDomainSelector', "unprocessed")

raw_total = RawData.count
puts "Raw entries: #{raw_total}"

# The instance is discarded; NOTE(review): constructing the selector
# presumably runs the selection itself -- confirm.
NewsCrawler::LinkSelector::SameDomainSelector.new

url_total = URLQueue.count
puts "URL entries: #{url_total}"
metadata ADDED
@@ -0,0 +1,203 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: news_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0.pre.1
5
+ platform: ruby
6
+ authors:
7
+ - Hà Quang Dương
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mongo
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: typhoeus
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.5'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.5'
55
+ - !ruby/object:Gem::Dependency
56
+ name: celluloid
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '0.14'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '0.14'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simpleconfig
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '2.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: robots
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '0.10'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '0.10'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '0.7'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '0.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: '5.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: '5.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: mocha
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ~>
130
+ - !ruby/object:Gem::Version
131
+ version: '0.14'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: '0.14'
139
+ - !ruby/object:Gem::Dependency
140
+ name: coveralls
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: A flexible, modular web crawler
154
+ email: contact@haqduong.net
155
+ executables:
156
+ - news_crawler
157
+ extensions: []
158
+ extra_rdoc_files: []
159
+ files:
160
+ - lib/news_crawler.rb
161
+ - lib/news_crawler/autostart.rb
162
+ - lib/news_crawler/config.rb
163
+ - lib/news_crawler/crawler_module.rb
164
+ - lib/news_crawler/downloader.rb
165
+ - lib/news_crawler/link_selector/same_domain_selector.rb
166
+ - lib/news_crawler/nc_logger.rb
167
+ - lib/news_crawler/storage/raw_data.rb
168
+ - lib/news_crawler/storage/raw_data/mongo_storage.rb
169
+ - lib/news_crawler/storage/raw_data/raw_data_engine.rb
170
+ - lib/news_crawler/storage/url_queue.rb
171
+ - lib/news_crawler/storage/url_queue/mongo_storage.rb
172
+ - lib/news_crawler/storage/url_queue/url_queue_engine.rb
173
+ - lib/news_crawler/storage/url_queue/url_queue_error.rb
174
+ - lib/news_crawler/url_helper.rb
175
+ - lib/news_crawler/utils/robots_patch.rb
176
+ - lib/news_crawler/default_config.yml
177
+ - lib/news_crawler/default_sds.yml
178
+ - bin/news_crawler
179
+ homepage: http://haqduong.github.io/news_crawler/
180
+ licenses:
181
+ - GPLv3
182
+ metadata: {}
183
+ post_install_message:
184
+ rdoc_options: []
185
+ require_paths:
186
+ - lib
187
+ required_ruby_version: !ruby/object:Gem::Requirement
188
+ requirements:
189
+ - - '>='
190
+ - !ruby/object:Gem::Version
191
+ version: 2.0.0
192
+ required_rubygems_version: !ruby/object:Gem::Requirement
193
+ requirements:
194
+ - - '>'
195
+ - !ruby/object:Gem::Version
196
+ version: 1.3.1
197
+ requirements: []
198
+ rubyforge_project:
199
+ rubygems_version: 2.0.3
200
+ signing_key:
201
+ specification_version: 4
202
+ summary: News crawler
203
+ test_files: []