tanakai 1.6.0 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +19 -6
- data/README.md +6 -0
- data/lib/tanakai/base/saver.rb +4 -6
- data/lib/tanakai/base.rb +5 -5
- data/lib/tanakai/cli/ansible_command_builder.rb +1 -1
- data/lib/tanakai/cli/generator.rb +1 -1
- data/lib/tanakai/cli.rb +1 -1
- data/lib/tanakai/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f639e8c843d1effdd2fa5268fe01ee0fda5adcb741ecda49b5e8f2c8a51f55a3
|
4
|
+
data.tar.gz: d03287426dc9e1e802ef149ad9edf5f83813ea067d22f9f6fb8fa7ec33bb3c5a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 12b9a122343c1599c87caf97cd527bf98c83db50e4f5fab40b4657932c41b41c5f3437a26297141a2e19f064f5083ba9b12c099d4bc20f537b0cf440aa92d9e2
|
7
|
+
data.tar.gz: 756e98178ef2c1fe80d9dfca936eab248e46e1c199665c23973fa9b96ad513a9bad2e4d0498e6ea3d3442a3aa76bf744109d91161e39c89aa0b29cdf83385ed0
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,18 +1,32 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## Next
|
4
|
+
* Your contribution here
|
5
|
+
|
6
|
+
## 1.7.1
|
7
|
+
### Fixes
|
8
|
+
* [#5](https://github.com/glaucocustodio/tanakai/pull/5): Replace `File.exists`/`Dir.exists` that have been removed on ruby 3.2 - [MrChriss](https://github.com/MrChriss)
|
9
|
+
|
10
|
+
## 1.7.0
|
11
|
+
### New
|
12
|
+
* Allow passing `data:` to `crawl!` - [glaucocustodio](https://github.com/glaucocustodio)
|
13
|
+
|
14
|
+
### Fixes
|
15
|
+
* [#4](https://github.com/glaucocustodio/tanakai/pull/4): Fix keyword args on `crawl!` - [milk1000cc](https://github.com/milk1000cc)
|
16
|
+
|
3
17
|
## 1.6.0
|
4
18
|
### New
|
5
|
-
* Add support to Ruby 3
|
19
|
+
* Add support to Ruby 3 - [glaucocustodio](https://github.com/glaucocustodio)
|
6
20
|
|
7
21
|
## 1.5.1
|
8
22
|
### New
|
9
|
-
* Add `response_type` to `in_parallel`
|
23
|
+
* Add `response_type` to `in_parallel` - [glaucocustodio](https://github.com/glaucocustodio)
|
10
24
|
|
11
25
|
## 1.5.0
|
12
26
|
### New
|
13
|
-
* First release as Tanakai
|
14
|
-
* Add support to [Apparition](https://github.com/twalpole/apparition)
|
15
|
-
* Add support to [Cuprite](https://github.com/rubycdp/cuprite)
|
27
|
+
* First release as Tanakai - [glaucocustodio](https://github.com/glaucocustodio)
|
28
|
+
* Add support to [Apparition](https://github.com/twalpole/apparition) - [glaucocustodio](https://github.com/glaucocustodio)
|
29
|
+
* Add support to [Cuprite](https://github.com/rubycdp/cuprite) - [glaucocustodio](https://github.com/glaucocustodio)
|
16
30
|
|
17
31
|
## 1.4.0
|
18
32
|
### New
|
@@ -120,7 +134,6 @@
|
|
120
134
|
* Fix Mechanize::Driver#proxy (there was a bug while using proxy for mechanize engine without authorization)
|
121
135
|
* Fix requests retries logic
|
122
136
|
|
123
|
-
|
124
137
|
## 1.0.1
|
125
138
|
* Add missing `logger` method to pipeline
|
126
139
|
* Fix `set_proxy` in Mechanize and Poltergeist builders
|
data/README.md
CHANGED
@@ -1355,6 +1355,12 @@ end # =>
|
|
1355
1355
|
# {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 18:49:22 +0400, :stop_time=>2018-08-22 18:49:23 +0400, :running_time=>0.801, :visits=>{:requests=>1, :responses=>1}, :items=>{:sent=>0, :processed=>0}, :error=>nil}
|
1356
1356
|
```
|
1357
1357
|
|
1358
|
+
You can also pass `data` to `crawl!`:
|
1359
|
+
|
1360
|
+
```ruby
|
1361
|
+
ExampleSpider.crawl!(data: { foo: "bar" })
|
1362
|
+
```
|
1363
|
+
|
1358
1364
|
So what if you don't care about stats and just want to process a request to a particular spider method and get the return value from this method? Use `.parse!` instead:
|
1359
1365
|
|
1360
1366
|
#### `.parse!(:method_name, url:)` method
|
data/lib/tanakai/base/saver.rb
CHANGED
@@ -42,7 +42,7 @@ module Tanakai
|
|
42
42
|
def save_to_json(item)
|
43
43
|
data = JSON.generate([item])
|
44
44
|
|
45
|
-
if @index > 1 || append && File.exists?(path)
|
45
|
+
if @index > 1 || append && File.exist?(path)
|
46
46
|
file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
|
47
47
|
File.open(path, "w") do |f|
|
48
48
|
f.write(file_content + data.sub(/\A\[/, ""))
|
@@ -55,7 +55,7 @@ module Tanakai
|
|
55
55
|
def save_to_pretty_json(item)
|
56
56
|
data = JSON.pretty_generate([item])
|
57
57
|
|
58
|
-
if @index > 1 || append && File.exists?(path)
|
58
|
+
if @index > 1 || append && File.exist?(path)
|
59
59
|
file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
|
60
60
|
File.open(path, "w") do |f|
|
61
61
|
f.write(file_content + data.sub(/\A\[\n/, ""))
|
@@ -68,7 +68,7 @@ module Tanakai
|
|
68
68
|
def save_to_jsonlines(item)
|
69
69
|
data = JSON.generate(item)
|
70
70
|
|
71
|
-
if @index > 1 || append && File.exists?(path)
|
71
|
+
if @index > 1 || append && File.exist?(path)
|
72
72
|
File.open(path, "a") { |file| file.write("\n" + data) }
|
73
73
|
else
|
74
74
|
File.open(path, "w") { |file| file.write(data) }
|
@@ -78,7 +78,7 @@ module Tanakai
|
|
78
78
|
def save_to_csv(item)
|
79
79
|
data = flatten_hash(item)
|
80
80
|
|
81
|
-
if @index > 1 || append && File.exists?(path)
|
81
|
+
if @index > 1 || append && File.exist?(path)
|
82
82
|
CSV.open(path, "a+", force_quotes: true) do |csv|
|
83
83
|
csv << data.values
|
84
84
|
end
|
@@ -102,5 +102,3 @@ module Tanakai
|
|
102
102
|
end
|
103
103
|
end
|
104
104
|
end
|
105
|
-
|
106
|
-
|
data/lib/tanakai/base.rb
CHANGED
@@ -100,7 +100,7 @@ module Tanakai
|
|
100
100
|
end
|
101
101
|
end
|
102
102
|
|
103
|
-
def self.crawl!(exception_on_fail: true)
|
103
|
+
def self.crawl!(exception_on_fail: true, data: {})
|
104
104
|
logger.error "Spider: already running: #{name}" and return false if running?
|
105
105
|
|
106
106
|
@storage = Storage.new
|
@@ -124,13 +124,13 @@ module Tanakai
|
|
124
124
|
if start_urls
|
125
125
|
start_urls.each do |start_url|
|
126
126
|
if start_url.class == Hash
|
127
|
-
spider.request_to(:parse, start_url)
|
127
|
+
spider.request_to(:parse, url: start_url, data: data)
|
128
128
|
else
|
129
|
-
spider.request_to(:parse, url: start_url)
|
129
|
+
spider.request_to(:parse, url: start_url, data: data)
|
130
130
|
end
|
131
131
|
end
|
132
132
|
else
|
133
|
-
spider.parse
|
133
|
+
spider.parse(data: data)
|
134
134
|
end
|
135
135
|
rescue StandardError, SignalException, SystemExit => e
|
136
136
|
@run_info.merge!(status: :failed, error: e.inspect)
|
@@ -160,7 +160,7 @@ module Tanakai
|
|
160
160
|
if args.present?
|
161
161
|
spider.public_send(handler, *args)
|
162
162
|
elsif request.present?
|
163
|
-
spider.request_to(handler, request)
|
163
|
+
spider.request_to(handler, **request)
|
164
164
|
else
|
165
165
|
spider.public_send(handler)
|
166
166
|
end
|
data/lib/tanakai/cli/ansible_command_builder.rb
CHANGED
@@ -31,7 +31,7 @@ module Tanakai
|
|
31
31
|
"--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
|
32
32
|
]
|
33
33
|
|
34
|
-
if File.exists? "config/automation.yml"
|
34
|
+
if File.exist? "config/automation.yml"
|
35
35
|
require 'yaml'
|
36
36
|
if config = YAML.load_file("config/automation.yml").dig(@playbook)
|
37
37
|
config.each { |key, value| @vars[key] = value unless @vars[key] }
|
data/lib/tanakai/cli/generator.rb
CHANGED
@@ -17,7 +17,7 @@ module Tanakai
|
|
17
17
|
|
18
18
|
def generate_spider(spider_name, in_project:)
|
19
19
|
spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
|
20
|
-
raise "Spider #{spider_path} already exists" if File.exists? spider_path
|
20
|
+
raise "Spider #{spider_path} already exist?" if File.exist? spider_path
|
21
21
|
|
22
22
|
spider_class = to_spider_class(spider_name)
|
23
23
|
create_file spider_path do
|
data/lib/tanakai/cli.rb
CHANGED
data/lib/tanakai/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tanakai
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.6.0
|
4
|
+
version: 1.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Victor Afanasev
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-02
|
12
|
+
date: 2023-11-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: thor
|