kabutops 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +66 -0
- data/lib/kabutops/recipe_item.rb +2 -1
- data/lib/kabutops/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 47c19efbb1d1f231ba44e5d19b0b29e52e948348
|
4
|
+
data.tar.gz: e9b7acd6a3808155f82ae4d74c83afb7f208fb16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9a9704dd8c0009fe35dbc47d0199246f67f4f4435730ce28247bac895394780abc1a17cf1b68c7e42e6a7f4665dcd8a4fb468890588ad58ec973a5708de0f48
|
7
|
+
data.tar.gz: c56de35682752cf9b672a9aac4494a10d8edc94c801e5dedda9dfe3c29350c2907969eb8e100866131a27a52a28e73fdf4139511cf0097993b6a48ba682795f7
|
data/README.md
CHANGED
@@ -129,6 +129,72 @@ Documents saved in the ElasticSearch will look like this one
|
|
129
129
|
}
|
130
130
|
```
|
131
131
|
|
132
|
+
Advanced
|
133
|
+
--------
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
class SomeCrawler < Kabutops::Crawler
|
137
|
+
collection [
|
138
|
+
{
|
139
|
+
id: 'some_id',
|
140
|
+
url: 'some_url.com/some_id',
|
141
|
+
},
|
142
|
+
]
|
143
|
+
agent ->{
|
144
|
+
# this lambda will be called before every request
|
145
|
+
# it should return an agent that responds to the 'get' method
|
146
|
+
}
|
147
|
+
proxy 'proxy_host.com', 1234 # proxy host and port
|
148
|
+
wait 7 # wait X seconds after every crawl
|
149
|
+
skip_existing true # if :id exists in db, the resource won't be crawled again
|
150
|
+
|
151
|
+
elasticsearch do
|
152
|
+
host 'some_host.com'
|
153
|
+
port 12345
|
154
|
+
index :name_of_index
|
155
|
+
type :type_of_es_doc
|
156
|
+
|
157
|
+
data each: 'xpath if multiple records are located on one site' do
|
158
|
+
# attrs
|
159
|
+
|
160
|
+
attr1 :xpath, '//*[@class="bla"]', :int # convert value to int
|
161
|
+
attr2 :css, '.bla', :float # convert value to float
|
162
|
+
end
|
163
|
+
|
164
|
+
callbacks do
|
165
|
+
before_save do |result|
|
166
|
+
# result is a hash that will be saved to the db
|
167
|
+
# you can alter result before save
|
168
|
+
end
|
169
|
+
|
170
|
+
after_save do |result|
|
171
|
+
# result has been successfully saved to the db
|
172
|
+
end
|
173
|
+
|
174
|
+
save_if do |resource, page, result|
|
175
|
+
# if false or nil is returned, the record is not saved to the db
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
callbacks do
|
181
|
+
after_crawl do |resource, page|
|
182
|
+
# page has been successfully crawled
|
183
|
+
end
|
184
|
+
|
185
|
+
before_cache do |resource, page|
|
186
|
+
# if caching is enabled you can check page here
|
187
|
+
# by raising an exception you can interrupt caching and
|
188
|
+
# resource processing
|
189
|
+
end
|
190
|
+
|
191
|
+
store_if do
|
192
|
+
# if false or nil is returned, the page is not processed
|
193
|
+
end
|
194
|
+
end
|
195
|
+
end
|
196
|
+
```
|
197
|
+
|
132
198
|
Debugging
|
133
199
|
---------
|
134
200
|
|
data/lib/kabutops/recipe_item.rb
CHANGED
@@ -29,6 +29,8 @@ module Kabutops
|
|
29
29
|
page.xpath(@value).text.gsub(/\u00a0/, ' ').strip
|
30
30
|
when :lambda, :proc
|
31
31
|
@value.call(resource, page)
|
32
|
+
when :const, :static
|
33
|
+
@value
|
32
34
|
else
|
33
35
|
raise "unknown recipe item type '#{item.type}'"
|
34
36
|
end
|
@@ -45,5 +47,4 @@ module Kabutops
|
|
45
47
|
end
|
46
48
|
end
|
47
49
|
|
48
|
-
|
49
50
|
end
|
data/lib/kabutops/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kabutops
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rene Klacan
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|