kabutops 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71776df69e9efa96e851421687e8fa1b9d50c8ed
4
- data.tar.gz: a4d5a921f92bcd9c94162df7a98f811cd0ed5b98
3
+ metadata.gz: 47c19efbb1d1f231ba44e5d19b0b29e52e948348
4
+ data.tar.gz: e9b7acd6a3808155f82ae4d74c83afb7f208fb16
5
5
  SHA512:
6
- metadata.gz: f6460d0155f5a646575870d8d6528d5f526befdf4dcfbe5815365fb5b4d75a1d9b513de505b5bf98444432a02fe2c44f148d3c99b4dc1f00e2f03e5b10adad03
7
- data.tar.gz: d9e22e3f0789629535b9c57a8ab1b73c2864f76c367d6d9579b1638255fe1e4827843c61cddae793ed3beb6c3e83cedef8695fbfa7257a726b65f61d676bd83b
6
+ metadata.gz: c9a9704dd8c0009fe35dbc47d0199246f67f4f4435730ce28247bac895394780abc1a17cf1b68c7e42e6a7f4665dcd8a4fb468890588ad58ec973a5708de0f48
7
+ data.tar.gz: c56de35682752cf9b672a9aac4494a10d8edc94c801e5dedda9dfe3c29350c2907969eb8e100866131a27a52a28e73fdf4139511cf0097993b6a48ba682795f7
data/README.md CHANGED
@@ -129,6 +129,72 @@ Documents saved in the ElasticSearch will look like this one
129
129
  }
130
130
  ```
131
131
 
132
+ Advanced
133
+ --------
134
+
135
+ ```ruby
136
+ class SomeCrawler < Kabutops::Crawler
137
+ collection [
138
+ {
139
+ id: 'some_id',
140
+ url: 'some_url.com/some_id',
141
+ },
142
+ ]
143
+ agent ->{
144
+ # this lambda will be called before every request
145
+ # it should return an agent that responds to the 'get' method
146
+ }
147
+ proxy 'proxy_host.com', 1234 # proxy host and port
148
+ wait 7 # wait X seconds after every crawl
149
+ skip_existing true # if :id exists in db, the resource won't be crawled again
150
+
151
+ elasticsearch do
152
+ host 'some_host.com'
153
+ port 12345
154
+ index :name_of_index
155
+ type :type_of_es_doc
156
+
157
+ data each: 'xpath if multiple records are located on one site' do
158
+ # attrs
159
+
160
+ attr1 :xpath, '//*[@class="bla"]', :int # convert value to int
161
+ attr2 :css, '.bla', :float # convert value to float
162
+ end
163
+
164
+ callbacks do
165
+ before_save do |result|
166
+ # result is a hash that will be saved to the db
167
+ # you can alter result before save
168
+ end
169
+
170
+ after_save do |result|
171
+ # result has been successfully saved to the db
172
+ end
173
+
174
+ save_if do |resource, page, result|
175
+ # if false or nil is returned record is not saved to the db
176
+ end
177
+ end
178
+ end
179
+
180
+ callbacks do
181
+ after_crawl do |resource, page|
182
+ # page has been successfully crawled
183
+ end
184
+
185
+ before_cache do |resource, page|
186
+ # if caching is enabled you can check page here
187
+ # by raising an exception you can interrupt caching and
188
+ # resource processing
189
+ end
190
+
191
+ store_if do
192
+ # if false or nil is returned page is not processed
193
+ end
194
+ end
195
+ end
196
+ ```
197
+
132
198
  Debugging
133
199
  ---------
134
200
 
@@ -29,6 +29,8 @@ module Kabutops
29
29
  page.xpath(@value).text.gsub(/\u00a0/, ' ').strip
30
30
  when :lambda, :proc
31
31
  @value.call(resource, page)
32
+ when :const, :static
33
+ @value
32
34
  else
33
35
  raise "unknown recipe item type '#{item.type}'"
34
36
  end
@@ -45,5 +47,4 @@ module Kabutops
45
47
  end
46
48
  end
47
49
 
48
-
49
50
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.1.1'
4
+ VERSION = '0.1.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-23 00:00:00.000000000 Z
11
+ date: 2014-08-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize