kabutops 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +66 -0
- data/lib/kabutops/recipe_item.rb +2 -1
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47c19efbb1d1f231ba44e5d19b0b29e52e948348
|
4
|
+
data.tar.gz: e9b7acd6a3808155f82ae4d74c83afb7f208fb16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9a9704dd8c0009fe35dbc47d0199246f67f4f4435730ce28247bac895394780abc1a17cf1b68c7e42e6a7f4665dcd8a4fb468890588ad58ec973a5708de0f48
|
7
|
+
data.tar.gz: c56de35682752cf9b672a9aac4494a10d8edc94c801e5dedda9dfe3c29350c2907969eb8e100866131a27a52a28e73fdf4139511cf0097993b6a48ba682795f7
|
data/README.md
CHANGED
@@ -129,6 +129,72 @@ Documents saved in the ElasticSearch will look like this one
|
|
129
129
|
}
|
130
130
|
```
|
131
131
|
|
132
|
+
Advanced
|
133
|
+
--------
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
class SomeCrawler < Kabutops::Crawler
|
137
|
+
collection [
|
138
|
+
{
|
139
|
+
id: 'some_id',
|
140
|
+
url: 'some_url.com/some_id',
|
141
|
+
},
|
142
|
+
]
|
143
|
+
agent ->{
|
144
|
+
# this lambda is called before every request
|
145
|
+
# it should return an agent that responds to the 'get' method
|
146
|
+
}
|
147
|
+
proxy 'proxy_host.com', 1234 # proxy host and port
|
148
|
+
wait 7 # wait X seconds after every crawl
|
149
|
+
skip_existing true # if :id exists in the db, the resource won't be crawled again
|
150
|
+
|
151
|
+
elasticsearch do
|
152
|
+
host 'some_host.com'
|
153
|
+
port 12345
|
154
|
+
index :name_of_index
|
155
|
+
type :type_of_es_doc
|
156
|
+
|
157
|
+
data each: 'xpath if multiple records are located on one site' do
|
158
|
+
# attrs
|
159
|
+
|
160
|
+
attr1 :xpath, '//*[@class="bla"]', :int # convert value to int
|
161
|
+
attr2 :css, '.bla', :float # convert value to float
|
162
|
+
end
|
163
|
+
|
164
|
+
callbacks do
|
165
|
+
before_save do |result|
|
166
|
+
# result is a hash that will be saved to the db
|
167
|
+
# you can alter result before save
|
168
|
+
end
|
169
|
+
|
170
|
+
after_save do |result|
|
171
|
+
# result has been successfully saved to the db
|
172
|
+
end
|
173
|
+
|
174
|
+
save_if do |resource, page, result|
|
175
|
+
# if false or nil is returned, the record is not saved to the db
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
callbacks do
|
181
|
+
after_crawl do |resource, page|
|
182
|
+
# page has been successfully crawled
|
183
|
+
end
|
184
|
+
|
185
|
+
before_cache do |resource, page|
|
186
|
+
# if caching is enabled you can check page here
|
187
|
+
# by raising an exception you can interrupt caching and
|
188
|
+
# resource processing
|
189
|
+
end
|
190
|
+
|
191
|
+
store_if do
|
192
|
+
# if false or nil is returned, the page is not processed
|
193
|
+
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
```
|
197
|
+
|
132
198
|
Debugging
|
133
199
|
---------
|
134
200
|
|
data/lib/kabutops/recipe_item.rb
CHANGED
@@ -29,6 +29,8 @@ module Kabutops
|
|
29
29
|
page.xpath(@value).text.gsub(/\u00a0/, ' ').strip
|
30
30
|
when :lambda, :proc
|
31
31
|
@value.call(resource, page)
|
32
|
+
when :const, :static
|
33
|
+
@value
|
32
34
|
else
|
33
35
|
raise "unknown recipe item type '#{item.type}'"
|
34
36
|
end
|
@@ -45,5 +47,4 @@ module Kabutops
|
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
48
|
-
|
49
50
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|