news_crawler 0.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,50 @@
1
+ #--
2
+ # NewsCrawler - a website crawler
3
+ #
4
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
5
+ #
6
+ # This file is part of NewsCrawler.
7
+ #
8
+ # NewsCrawler is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # NewsCrawler is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+
22
module NewsCrawler
  # Helper methods for splitting URLs into parts and comparing their hosts.
  # The methods are instance methods, so the module is meant to be mixed in
  # (include / extend).
  module URLHelper
    # Tells whether two URLs belong to the same domain.
    #
    # The domain of each URL is split into dot-separated labels, and the
    # labels are compared from the right (top-level label first) over the
    # length of the shorter domain. A host and any of its sub-domains are
    # therefore treated as the same domain ("www.example.com" matches
    # "example.com"). Labels are compared case-insensitively because host
    # names are case-insensitive.
    #
    # NOTE: since only the common suffix is compared, a bare suffix such as
    # "com" matches every "*.com" host.
    #
    # @param [ String ] url1 Url 1
    # @param [ String ] url2 Url 2
    # @return [ Boolean ] true if both urls belong to the same domain; false
    #   if they differ or if either url has no domain part (e.g. "/path")
    def same_domain?(url1, url2)
      host1 = get_url_path(url1)[:domain]
      host2 = get_url_path(url2)[:domain]
      # A URL without a domain part cannot be matched against anything.
      return false if host1.nil? || host2.nil?

      labels1 = host1.downcase.split('.').reverse
      labels2 = host2.downcase.split('.').reverse
      # zip pads with nil when the second host has fewer labels and ignores
      # its extra labels when it has more, so only the shared suffix counts.
      labels1.zip(labels2).all? { |left, right| right.nil? || left == right }
    end

    # Splits a URL into 3 parts: scheme, domain, path.
    #
    # Only "http" and "https" are recognised as a scheme. The domain is
    # everything up to the first "/", so it still contains a port if one is
    # present. Parts that are absent from the input are nil.
    #
    # @param [ String ] url anything responding to #to_s; nil is treated as
    #   an empty string
    # @return [ Hash ] contains parts under the keys :scheme, :domain, :path
    def get_url_path(url)
      pattern = %r{((?<scheme>(http|https))://)?(?<domain>[^/]+)?(?<path>/.*)?}
      # Every group is optional, so the pattern matches any string.
      parts = pattern.match(url.to_s)
      { scheme: parts[:scheme], domain: parts[:domain], path: parts[:path] }
    end
  end
end
@@ -0,0 +1,34 @@
1
+ #--
2
+ # NewsCrawler - a website crawler
3
+ #
4
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
5
+ #
6
+ # This file is part of NewsCrawler.
7
+ #
8
+ # NewsCrawler is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # NewsCrawler is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
20
+ #++
21
+
22
+ require 'robots'
23
+ require 'uri'
24
+ require 'singleton'
25
+
26
# Reopens the Robots class (this file requires 'robots' beforehand) so that a
# single shared instance is used, reachable through Robots.instance.
class Robots
  include Singleton

  # User-agent string; starts out as 'NewsCrawler' and may be reassigned.
  attr_accessor :user_agent

  # Takes no arguments, which is what Singleton needs in order to build the
  # one instance lazily.
  def initialize
    # presumably the cache the robots gem fills per host — confirm
    @parsed = {}
    @user_agent = 'NewsCrawler'
  end
end
@@ -0,0 +1,47 @@
1
+ # -*- coding: utf-8 -*-
2
+ #--
3
+ # NewsCrawler - a website crawler
4
+ #
5
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
6
+ #
7
+ # This file is part of NewsCrawler.
8
+ #
9
+ # NewsCrawler is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NewsCrawler is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+ # TODO implement easy API
24
+
25
+ require 'news_crawler/autostart'
26
+ require 'news_crawler/config'
27
+ require 'news_crawler/downloader'
28
+ require 'news_crawler/link_selector/same_domain_selector'
29
+
30
# Select the :mongo engine for both storage classes.
NewsCrawler::Storage::RawData.set_engine(:mongo)
NewsCrawler::Storage::URLQueue.set_engine(:mongo)

# Mix the storage namespace into the top level so that RawData and URLQueue
# can be referenced without the NewsCrawler::Storage prefix from here on.
include NewsCrawler::Storage

URLQueue.clear

# The download stage is currently disabled:
#   RawData.clear
#   dwl = NewsCrawler::Downloader.new
#   dwl.run
#   # dwl.async.run
#   # dwl.graceful_terminate

# Mark everything as "unprocessed" on behalf of the same-domain selector
# (exact semantics of mark_all live in URLQueue — confirm there).
URLQueue.mark_all('NewsCrawler::LinkSelector::SameDomainSelector', 'unprocessed')

raw_entries = RawData.count
puts "Raw entries: #{raw_entries}"

# The new instance is discarded; presumably constructing the selector is what
# kicks off link selection — confirm against SameDomainSelector#initialize.
NewsCrawler::LinkSelector::SameDomainSelector.new

url_entries = URLQueue.count
puts "URL entries: #{url_entries}"
metadata ADDED
@@ -0,0 +1,203 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: news_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0.pre.1
5
+ platform: ruby
6
+ authors:
7
+ - Hà Quang Dương
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mongo
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: typhoeus
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.5'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.5'
55
+ - !ruby/object:Gem::Dependency
56
+ name: celluloid
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '0.14'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ~>
67
+ - !ruby/object:Gem::Version
68
+ version: '0.14'
69
+ - !ruby/object:Gem::Dependency
70
+ name: simpleconfig
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '2.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '2.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: robots
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '0.10'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '0.10'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '0.7'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '0.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: '5.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: '5.0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: mocha
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ~>
130
+ - !ruby/object:Gem::Version
131
+ version: '0.14'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ~>
137
+ - !ruby/object:Gem::Version
138
+ version: '0.14'
139
+ - !ruby/object:Gem::Dependency
140
+ name: coveralls
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - '>='
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - '>='
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: A flexible, modular web crawler
154
+ email: contact@haqduong.net
155
+ executables:
156
+ - news_crawler
157
+ extensions: []
158
+ extra_rdoc_files: []
159
+ files:
160
+ - lib/news_crawler.rb
161
+ - lib/news_crawler/autostart.rb
162
+ - lib/news_crawler/config.rb
163
+ - lib/news_crawler/crawler_module.rb
164
+ - lib/news_crawler/downloader.rb
165
+ - lib/news_crawler/link_selector/same_domain_selector.rb
166
+ - lib/news_crawler/nc_logger.rb
167
+ - lib/news_crawler/storage/raw_data.rb
168
+ - lib/news_crawler/storage/raw_data/mongo_storage.rb
169
+ - lib/news_crawler/storage/raw_data/raw_data_engine.rb
170
+ - lib/news_crawler/storage/url_queue.rb
171
+ - lib/news_crawler/storage/url_queue/mongo_storage.rb
172
+ - lib/news_crawler/storage/url_queue/url_queue_engine.rb
173
+ - lib/news_crawler/storage/url_queue/url_queue_error.rb
174
+ - lib/news_crawler/url_helper.rb
175
+ - lib/news_crawler/utils/robots_patch.rb
176
+ - lib/news_crawler/default_config.yml
177
+ - lib/news_crawler/default_sds.yml
178
+ - bin/news_crawler
179
+ homepage: http://haqduong.github.io/news_crawler/
180
+ licenses:
181
+ - GPLv3
182
+ metadata: {}
183
+ post_install_message:
184
+ rdoc_options: []
185
+ require_paths:
186
+ - lib
187
+ required_ruby_version: !ruby/object:Gem::Requirement
188
+ requirements:
189
+ - - '>='
190
+ - !ruby/object:Gem::Version
191
+ version: 2.0.0
192
+ required_rubygems_version: !ruby/object:Gem::Requirement
193
+ requirements:
194
+ - - '>'
195
+ - !ruby/object:Gem::Version
196
+ version: 1.3.1
197
+ requirements: []
198
+ rubyforge_project:
199
+ rubygems_version: 2.0.3
200
+ signing_key:
201
+ specification_version: 4
202
+ summary: News crawler
203
+ test_files: []