kabutops 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71776df69e9efa96e851421687e8fa1b9d50c8ed
4
- data.tar.gz: a4d5a921f92bcd9c94162df7a98f811cd0ed5b98
3
+ metadata.gz: 47c19efbb1d1f231ba44e5d19b0b29e52e948348
4
+ data.tar.gz: e9b7acd6a3808155f82ae4d74c83afb7f208fb16
5
5
  SHA512:
6
- metadata.gz: f6460d0155f5a646575870d8d6528d5f526befdf4dcfbe5815365fb5b4d75a1d9b513de505b5bf98444432a02fe2c44f148d3c99b4dc1f00e2f03e5b10adad03
7
- data.tar.gz: d9e22e3f0789629535b9c57a8ab1b73c2864f76c367d6d9579b1638255fe1e4827843c61cddae793ed3beb6c3e83cedef8695fbfa7257a726b65f61d676bd83b
6
+ metadata.gz: c9a9704dd8c0009fe35dbc47d0199246f67f4f4435730ce28247bac895394780abc1a17cf1b68c7e42e6a7f4665dcd8a4fb468890588ad58ec973a5708de0f48
7
+ data.tar.gz: c56de35682752cf9b672a9aac4494a10d8edc94c801e5dedda9dfe3c29350c2907969eb8e100866131a27a52a28e73fdf4139511cf0097993b6a48ba682795f7
data/README.md CHANGED
@@ -129,6 +129,72 @@ Documents saved in the ElasticSearch will look like this one
129
129
  }
130
130
  ```
131
131
 
132
+ Advanced
133
+ --------
134
+
135
+ ```ruby
136
+ class SomeCrawler < Kabutops::Crawler
137
+ collection [
138
+ {
139
+ id: 'some_id',
140
+ url: 'some_url.com/some_id',
141
+ },
142
+ ]
143
+ agent ->{
144
+ # this lambda will be called before every request
145
+ # you should return an agent that responds to the 'get' method
146
+ }
147
+ proxy 'proxy_host.com', 1234 # proxy host and port
148
+ wait 7 # wait X seconds after every crawl
149
+ skip_existing true # if :id exists in the db, the resource won't be crawled again
150
+
151
+ elasticsearch do
152
+ host 'some_host.com'
153
+ port 12345
154
+ index :name_of_index
155
+ type :type_of_es_doc
156
+
157
+ data each: 'xpath if multiple records are located on one site' do
158
+ # attrs
159
+
160
+ attr1 :xpath, '//*[@class="bla"]', :int # convert value to int
161
+ attr2 :css, '.bla', :float # convert value to float
162
+ end
163
+
164
+ callbacks do
165
+ before_save do |result|
166
+ # result is a hash that will be saved to the db
167
+ # you can alter result before save
168
+ end
169
+
170
+ after_save do |result|
171
+ # result has been successfully saved to the db
172
+ end
173
+
174
+ save_if do |resource, page, result|
175
+ # if false or nil is returned, the record is not saved to the db
176
+ end
177
+ end
178
+ end
179
+
180
+ callbacks do
181
+ after_crawl do |resource, page|
182
+ # page has been successfully crawled
183
+ end
184
+
185
+ before_cache do |resource, page|
186
+ # if caching is enabled you can check page here
187
+ # by raising an exception you can interrupt caching and
188
+ # resource processing
189
+ end
190
+
191
+ store_if do
192
+ # if false or nil is returned, the page is not processed
193
+ end
194
+ end
195
+ end
196
+ ```
197
+
132
198
  Debugging
133
199
  ---------
134
200
 
@@ -29,6 +29,8 @@ module Kabutops
29
29
  page.xpath(@value).text.gsub(/\u00a0/, ' ').strip
30
30
  when :lambda, :proc
31
31
  @value.call(resource, page)
32
+ when :const, :static
33
+ @value
32
34
  else
33
35
  raise "unknown recipe item type '#{item.type}'"
34
36
  end
@@ -45,5 +47,4 @@ module Kabutops
45
47
  end
46
48
  end
47
49
 
48
-
49
50
  end
@@ -1,5 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
 
3
3
  module Kabutops
4
- VERSION = '0.1.1'
4
+ VERSION = '0.1.2'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kabutops
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rene Klacan
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-07-23 00:00:00.000000000 Z
11
+ date: 2014-08-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize