news_crawler 0.0.3.pre.1 → 0.0.3.pre.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
4
- data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
3
+ metadata.gz: 65491e340a0cb59c0ecac8f6a78d2dd0405fce65
4
+ data.tar.gz: 44a9cc62f420914287f42856928db6a3ff9714c9
5
5
  SHA512:
6
- metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
7
- data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
6
+ metadata.gz: bc1755b0fe3837399d9fb8143932ded28a711b0b36f257a7c98e7440c469ca065f63c5065058c84177ee1f15646b3b51058f8cc3188801470e016b3a023a2eff
7
+ data.tar.gz: f13e0d7d83c67818ed3d73ff6f4d423f99c18df6d9ff2118e651580668cc102484cbdbf2f4d74ab93360656b9285beb190d7291c952930d58ba6eb7cd2bf275f
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
  require 'simple_config'
24
+ require 'yaml'
24
25
 
25
26
  module NewsCrawler
26
27
  class CrawlerConfig
@@ -32,17 +33,32 @@ module NewsCrawler
32
33
  def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
33
34
  if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
34
35
  @app_loaded = true
35
- SimpleConfig.for :application do
36
- load file
37
- end
36
+ merge_config(:application, file)
38
37
  end
39
38
  end
40
39
 
41
40
  def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
42
41
  if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
43
42
  @sds_loaded = true
44
- SimpleConfig.for :same_domain_selector do
45
- load file
43
+ merge_config(:same_domain_selector, file)
44
+ end
45
+ end
46
+
47
+ def self.merge_config(mod, file)
48
+ conf = YAML.load_file(file)
49
+ conf.each do | key, val |
50
+ if val.is_a? Hash
51
+ val.each do | k1, v1 |
52
+ SimpleConfig.for mod do
53
+ group key do
54
+ set k1, v1
55
+ end
56
+ end
57
+ end
58
+ else
59
+ SimpleConfig.for mod do
60
+ set key, val
61
+ end
46
62
  end
47
63
  end
48
64
  end
@@ -119,10 +119,12 @@ module NewsCrawler
119
119
  return false
120
120
  end
121
121
 
122
- exclude_group.to_hash.keys.each do | url_e |
123
- if url_domain.to_s.end_with? url_e.to_s
124
- exclude_list = config.exclude.get(url_e)
125
- break
122
+ unless exclude_group.nil?
123
+ exclude_group.to_hash.keys.each do | url_e |
124
+ if url_domain.to_s.end_with? url_e.to_s
125
+ exclude_list = config.exclude.get(url_e)
126
+ break
127
+ end
126
128
  end
127
129
  end
128
130
 
@@ -23,6 +23,7 @@
23
23
  require 'mongo'
24
24
  require 'simple_config'
25
25
  require 'news_crawler/storage/raw_data/raw_data_engine'
26
+ require 'news_crawler/nc_logger'
26
27
 
27
28
 
28
29
  module NewsCrawler
@@ -46,6 +47,7 @@ module NewsCrawler
46
47
  # @param [ String ] url
47
48
  # @param [ String ] body
48
49
  def add(url, body)
50
+ body.encode!('utf-8', :invalid => :replace, :undef => :replace)
49
51
  @coll.update({:url => url},
50
52
  {:$set => {:body => body}},
51
53
  {:upsert => true})
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: news_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3.pre.1
4
+ version: 0.0.3.pre.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hà Quang Dương
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-23 00:00:00.000000000 Z
11
+ date: 2013-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mongo