news_crawler 0.0.3.pre.1 → 0.0.3.pre.3
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 65491e340a0cb59c0ecac8f6a78d2dd0405fce65
|
4
|
+
data.tar.gz: 44a9cc62f420914287f42856928db6a3ff9714c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bc1755b0fe3837399d9fb8143932ded28a711b0b36f257a7c98e7440c469ca065f63c5065058c84177ee1f15646b3b51058f8cc3188801470e016b3a023a2eff
|
7
|
+
data.tar.gz: f13e0d7d83c67818ed3d73ff6f4d423f99c18df6d9ff2118e651580668cc102484cbdbf2f4d74ab93360656b9285beb190d7291c952930d58ba6eb7cd2bf275f
|
data/lib/news_crawler/config.rb
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
#++
|
22
22
|
|
23
23
|
require 'simple_config'
|
24
|
+
require 'yaml'
|
24
25
|
|
25
26
|
module NewsCrawler
|
26
27
|
class CrawlerConfig
|
@@ -32,17 +33,32 @@ module NewsCrawler
|
|
32
33
|
def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
|
33
34
|
if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
|
34
35
|
@app_loaded = true
|
35
|
-
|
36
|
-
load file
|
37
|
-
end
|
36
|
+
merge_config(:application, file)
|
38
37
|
end
|
39
38
|
end
|
40
39
|
|
41
40
|
def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
|
42
41
|
if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
|
43
42
|
@sds_loaded = true
|
44
|
-
|
45
|
-
|
43
|
+
merge_config(:same_domain_selector, file)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.merge_config(mod, file)
|
48
|
+
conf = YAML.load_file(file)
|
49
|
+
conf.each do | key, val |
|
50
|
+
if val.is_a? Hash
|
51
|
+
val.each do | k1, v1 |
|
52
|
+
SimpleConfig.for mod do
|
53
|
+
group key do
|
54
|
+
set k1, v1
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
else
|
59
|
+
SimpleConfig.for mod do
|
60
|
+
set key, val
|
61
|
+
end
|
46
62
|
end
|
47
63
|
end
|
48
64
|
end
|
@@ -119,10 +119,12 @@ module NewsCrawler
|
|
119
119
|
return false
|
120
120
|
end
|
121
121
|
|
122
|
-
exclude_group.
|
123
|
-
|
124
|
-
|
125
|
-
|
122
|
+
unless exclude_group.nil?
|
123
|
+
exclude_group.to_hash.keys.each do | url_e |
|
124
|
+
if url_domain.to_s.end_with? url_e.to_s
|
125
|
+
exclude_list = config.exclude.get(url_e)
|
126
|
+
break
|
127
|
+
end
|
126
128
|
end
|
127
129
|
end
|
128
130
|
|
@@ -23,6 +23,7 @@
|
|
23
23
|
require 'mongo'
|
24
24
|
require 'simple_config'
|
25
25
|
require 'news_crawler/storage/raw_data/raw_data_engine'
|
26
|
+
require 'news_crawler/nc_logger'
|
26
27
|
|
27
28
|
|
28
29
|
module NewsCrawler
|
@@ -46,6 +47,7 @@ module NewsCrawler
|
|
46
47
|
# @param [ String ] url
|
47
48
|
# @param [ String ] body
|
48
49
|
def add(url, body)
|
50
|
+
body.encode!('utf-8', :invalid => :replace, :undef => :replace)
|
49
51
|
@coll.update({:url => url},
|
50
52
|
{:$set => {:body => body}},
|
51
53
|
{:upsert => true})
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3.pre.1
|
4
|
+
version: 0.0.3.pre.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|