news_crawler 0.0.3.pre.1 → 0.0.3.pre.3

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
4
- data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
3
+ metadata.gz: 65491e340a0cb59c0ecac8f6a78d2dd0405fce65
4
+ data.tar.gz: 44a9cc62f420914287f42856928db6a3ff9714c9
5
5
  SHA512:
6
- metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
7
- data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
6
+ metadata.gz: bc1755b0fe3837399d9fb8143932ded28a711b0b36f257a7c98e7440c469ca065f63c5065058c84177ee1f15646b3b51058f8cc3188801470e016b3a023a2eff
7
+ data.tar.gz: f13e0d7d83c67818ed3d73ff6f4d423f99c18df6d9ff2118e651580668cc102484cbdbf2f4d74ab93360656b9285beb190d7291c952930d58ba6eb7cd2bf275f
@@ -21,6 +21,7 @@
21
21
  #++
22
22
 
23
23
  require 'simple_config'
24
+ require 'yaml'
24
25
 
25
26
  module NewsCrawler
26
27
  class CrawlerConfig
@@ -32,17 +33,32 @@ module NewsCrawler
32
33
  def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
33
34
  if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
34
35
  @app_loaded = true
35
- SimpleConfig.for :application do
36
- load file
37
- end
36
+ merge_config(:application, file)
38
37
  end
39
38
  end
40
39
 
41
40
  def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
42
41
  if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
43
42
  @sds_loaded = true
44
- SimpleConfig.for :same_domain_selector do
45
- load file
43
+ merge_config(:same_domain_selector, file)
44
+ end
45
+ end
46
+
47
+ def self.merge_config(mod, file)
48
+ conf = YAML.load_file(file)
49
+ conf.each do | key, val |
50
+ if val.is_a? Hash
51
+ val.each do | k1, v1 |
52
+ SimpleConfig.for mod do
53
+ group key do
54
+ set k1, v1
55
+ end
56
+ end
57
+ end
58
+ else
59
+ SimpleConfig.for mod do
60
+ set key, val
61
+ end
46
62
  end
47
63
  end
48
64
  end
@@ -119,10 +119,12 @@ module NewsCrawler
119
119
  return false
120
120
  end
121
121
 
122
- exclude_group.to_hash.keys.each do | url_e |
123
- if url_domain.to_s.end_with? url_e.to_s
124
- exclude_list = config.exclude.get(url_e)
125
- break
122
+ unless exclude_group.nil?
123
+ exclude_group.to_hash.keys.each do | url_e |
124
+ if url_domain.to_s.end_with? url_e.to_s
125
+ exclude_list = config.exclude.get(url_e)
126
+ break
127
+ end
126
128
  end
127
129
  end
128
130
 
@@ -23,6 +23,7 @@
23
23
  require 'mongo'
24
24
  require 'simple_config'
25
25
  require 'news_crawler/storage/raw_data/raw_data_engine'
26
+ require 'news_crawler/nc_logger'
26
27
 
27
28
 
28
29
  module NewsCrawler
@@ -46,6 +47,7 @@ module NewsCrawler
46
47
  # @param [ String ] url
47
48
  # @param [ String ] body
48
49
  def add(url, body)
50
+ body.encode!('utf-8', :invalid => :replace, :undef => :replace)
49
51
  @coll.update({:url => url},
50
52
  {:$set => {:body => body}},
51
53
  {:upsert => true})
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: news_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3.pre.1
4
+ version: 0.0.3.pre.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hà Quang Dương
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-23 00:00:00.000000000 Z
11
+ date: 2013-07-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mongo