news_crawler 0.0.3.pre.1 → 0.0.3.pre.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 65491e340a0cb59c0ecac8f6a78d2dd0405fce65
|
|
4
|
+
data.tar.gz: 44a9cc62f420914287f42856928db6a3ff9714c9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bc1755b0fe3837399d9fb8143932ded28a711b0b36f257a7c98e7440c469ca065f63c5065058c84177ee1f15646b3b51058f8cc3188801470e016b3a023a2eff
|
|
7
|
+
data.tar.gz: f13e0d7d83c67818ed3d73ff6f4d423f99c18df6d9ff2118e651580668cc102484cbdbf2f4d74ab93360656b9285beb190d7291c952930d58ba6eb7cd2bf275f
|
data/lib/news_crawler/config.rb
CHANGED
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
#++
|
|
22
22
|
|
|
23
23
|
require 'simple_config'
|
|
24
|
+
require 'yaml'
|
|
24
25
|
|
|
25
26
|
module NewsCrawler
|
|
26
27
|
class CrawlerConfig
|
|
@@ -32,17 +33,32 @@ module NewsCrawler
|
|
|
32
33
|
def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
|
|
33
34
|
if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
|
|
34
35
|
@app_loaded = true
|
|
35
|
-
|
|
36
|
-
load file
|
|
37
|
-
end
|
|
36
|
+
merge_config(:application, file)
|
|
38
37
|
end
|
|
39
38
|
end
|
|
40
39
|
|
|
41
40
|
def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
|
|
42
41
|
if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
|
|
43
42
|
@sds_loaded = true
|
|
44
|
-
|
|
45
|
-
|
|
43
|
+
merge_config(:same_domain_selector, file)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def self.merge_config(mod, file)
|
|
48
|
+
conf = YAML.load_file(file)
|
|
49
|
+
conf.each do | key, val |
|
|
50
|
+
if val.is_a? Hash
|
|
51
|
+
val.each do | k1, v1 |
|
|
52
|
+
SimpleConfig.for mod do
|
|
53
|
+
group key do
|
|
54
|
+
set k1, v1
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
else
|
|
59
|
+
SimpleConfig.for mod do
|
|
60
|
+
set key, val
|
|
61
|
+
end
|
|
46
62
|
end
|
|
47
63
|
end
|
|
48
64
|
end
|
|
@@ -119,10 +119,12 @@ module NewsCrawler
|
|
|
119
119
|
return false
|
|
120
120
|
end
|
|
121
121
|
|
|
122
|
-
exclude_group.
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
122
|
+
unless exclude_group.nil?
|
|
123
|
+
exclude_group.to_hash.keys.each do | url_e |
|
|
124
|
+
if url_domain.to_s.end_with? url_e.to_s
|
|
125
|
+
exclude_list = config.exclude.get(url_e)
|
|
126
|
+
break
|
|
127
|
+
end
|
|
126
128
|
end
|
|
127
129
|
end
|
|
128
130
|
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
require 'mongo'
|
|
24
24
|
require 'simple_config'
|
|
25
25
|
require 'news_crawler/storage/raw_data/raw_data_engine'
|
|
26
|
+
require 'news_crawler/nc_logger'
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
module NewsCrawler
|
|
@@ -46,6 +47,7 @@ module NewsCrawler
|
|
|
46
47
|
# @param [ String ] url
|
|
47
48
|
# @param [ String ] body
|
|
48
49
|
def add(url, body)
|
|
50
|
+
body.encode!('utf-8', :invalid => :replace, :undef => :replace)
|
|
49
51
|
@coll.update({:url => url},
|
|
50
52
|
{:$set => {:body => body}},
|
|
51
53
|
{:upsert => true})
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: news_crawler
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.3.pre.1
|
|
4
|
+
version: 0.0.3.pre.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Hà Quang Dương
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2013-07-
|
|
11
|
+
date: 2013-07-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mongo
|