deepl_diff 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.rubocop.yml +15 -0
- data/.travis.yml +12 -0
- data/Gemfile +5 -0
- data/README.md +117 -0
- data/Rakefile +8 -0
- data/deepl_diff.gemspec +55 -0
- data/lib/deepl_diff/cache.rb +38 -0
- data/lib/deepl_diff/chunker.rb +56 -0
- data/lib/deepl_diff/linearizer.rb +29 -0
- data/lib/deepl_diff/redis_cache_store.rb +26 -0
- data/lib/deepl_diff/redis_rate_limiter.rb +22 -0
- data/lib/deepl_diff/request.rb +157 -0
- data/lib/deepl_diff/spacing.rb +31 -0
- data/lib/deepl_diff/tokenizer.rb +159 -0
- data/lib/deepl_diff/version.rb +5 -0
- data/lib/deepl_diff.rb +28 -0
- metadata +259 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 601d9219d37903d18d03a30e85fee9ce783522fbfb75d616f2873ef4ca6da950
|
4
|
+
data.tar.gz: a0b23c609b7f6f48b74a83a818d96d3bb13665568f3563a6e67fe2c16fb42f44
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a739f33576f117fde32e6c6fd9527835a2f6aeb3fbdd4b3265aa4e0c913e519bc0f8bd536ed2abdcee50721823e75286aed909cbfc59fc889cea40538c83847c
|
7
|
+
data.tar.gz: 87beae80de1cf85db6308cc613e71b428b0008a4b797ef276ef4d16abd65f162aea2cfd09b454ddd6f2fb438ae3939af7c48c42e850f3f4772c2184be1c35839
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
# DeepLDiff
|
2
|
+
|
3
|
+
DeepL API wrapper helps to translate only changes between revisions of long texts.
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
**DeepLDiff** is based on [GoogleTranslateDiff](https://github.com/gzigzigzeo/google_translate_diff).
|
9
|
+
## Use case
|
10
|
+
|
11
|
+
Assume your project contains a significant number of product descriptions which:
|
12
|
+
- Require retranslation each time user edits them.
|
13
|
+
- Have a lot of equal parts (like return policy).
|
14
|
+
- Change frequently.
|
15
|
+
|
16
|
+
If your user changes a single word within the long description, you will be charged for the retranslation of the whole text.
|
17
|
+
|
18
|
+
A much better approach is to translate every repeated structural element (sentence) in your texts just once to save money. This gem helps you do exactly that.
|
19
|
+
|
20
|
+
## Installation
|
21
|
+
|
22
|
+
Add this line to your application's Gemfile:
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
gem 'deepl_diff'
|
26
|
+
```
|
27
|
+
|
28
|
+
And then execute:
|
29
|
+
|
30
|
+
$ bundle
|
31
|
+
|
32
|
+
Or install it yourself as:
|
33
|
+
|
34
|
+
$ gem install deepl_diff
|
35
|
+
|
36
|
+
## Usage
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require "deepl_diff"
|
40
|
+
|
41
|
+
# These dependencies are not required automatically, as you might want to roll your own cache based on a different store
|
42
|
+
require "redis"
|
43
|
+
require "connection_pool"
|
44
|
+
require "redis-namespace"
|
45
|
+
require "ratelimit" # Optional, only needed if you use DeepLDiff::RedisRateLimiter
|
46
|
+
|
47
|
+
# Setup https://github.com/wikiti/deepl-rb
|
48
|
+
DeepL.configure do |config|
|
49
|
+
config.auth_key = 'your-api-token'
|
50
|
+
config.host = 'https://api-free.deepl.com' # Default value is 'https://api.deepl.com'
|
51
|
+
config.version = 'v1' # Default value is 'v2'
|
52
|
+
end
|
53
|
+
|
54
|
+
# I always use pool for redis
|
55
|
+
pool = ConnectionPool.new(size: 10, timeout: 5) { Redis.new }
|
56
|
+
|
57
|
+
# Pass DeepL for DeepLDiff
|
58
|
+
DeepLDiff.api = DeepL
|
59
|
+
|
60
|
+
DeepLDiff.cache_store =
|
61
|
+
DeepLDiff::RedisCacheStore.new(pool, timeout: 7.days, namespace: "t")
|
62
|
+
|
63
|
+
# Optional
|
64
|
+
DeepLDiff.rate_limiter =
|
65
|
+
DeepLDiff::RedisRateLimiter.new(
|
66
|
+
pool, 8000, 60, namespace: "t" # threshold and interval are positional arguments
|
67
|
+
)
|
68
|
+
|
69
|
+
DeepLDiff.translate("test translations", from: "en", to: "es")
|
70
|
+
```
|
71
|
+
|
72
|
+
## How it works
|
73
|
+
|
74
|
+
- Text nodes are extracted from HTML.
|
75
|
+
- Every text node is split into sentences (using `punkt-segmenter` gem).
|
76
|
+
- Cache is checked for the presence of each sentence (using language couple and a hash of string).
|
77
|
+
- Missing sentences are translated via API and cached.
|
78
|
+
- Original HTML is recombined from translations and cache data.
|
79
|
+
|
80
|
+
*NOTE:* if `:from` is not specified or is `nil`, the DeepL API will be called twice: first with a sample of the text (up to 100 characters long) to detect the source language, and then with the entire text.
|
81
|
+
Specify `:from` explicitly whenever possible to avoid the extra call.
|
82
|
+
|
83
|
+
## Input
|
84
|
+
|
85
|
+
`::translate` can receive string, array or deep hash and will return the same, but translated.
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
DeepLDiff.translate("test", from: "en", to: "es")
|
89
|
+
DeepLDiff.translate(%w[test language], from: "en", to: "es")
|
90
|
+
DeepLDiff.translate(
|
91
|
+
{ title: "test", values: { type: "frequent" } }, from: "en", to: "es"
|
92
|
+
)
|
93
|
+
```
|
94
|
+
|
95
|
+
See `DeepLDiff::Linearizer` for details.
|
96
|
+
|
97
|
+
## HTML
|
98
|
+
|
99
|
+
You can pass HTML just like plain text:
|
100
|
+
|
101
|
+
```ruby
|
102
|
+
DeepLDiff.translate("<b>Black</b>", from: "en", to: "es")
|
103
|
+
```
|
104
|
+
|
105
|
+
## Very long texts
|
106
|
+
|
107
|
+
The DeepL API has a limitation: a query cannot be longer than approximately 128 KB. If your text is really that long, multiple queries will be used to translate it automatically.
|
108
|
+
|
109
|
+
## Development
|
110
|
+
|
111
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
112
|
+
|
113
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
114
|
+
|
115
|
+
## Contributing
|
116
|
+
|
117
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/Halvanhelv/deepl_diff.
|
data/Rakefile
ADDED
data/deepl_diff.gemspec
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path("lib", __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require "deepl_diff/version"
|
6
|
+
|
7
|
+
# rubocop:disable Metrics/BlockLength
|
8
|
+
# Gem specification for deepl_diff. Relies on DeepLDiff::VERSION having been
# loaded by the `require "deepl_diff/version"` that precedes this block.
Gem::Specification.new do |spec|
  # One clean sentence; the former %( ... ) literals shipped their leading
  # newline and trailing padding in the published description.
  blurb = "DeepL API wrapper for Ruby which helps to translate only changes " \
          "between revisions of long texts."

  spec.name          = "deepl_diff"
  spec.version       = DeepLDiff::VERSION
  spec.authors       = ["Islam Gagiev"]
  spec.email         = ["omniacinis@gmail.com"]

  spec.summary       = blurb
  spec.description   = blurb
  spec.homepage      = "https://github.com/Halvanhelv/deepl_diff"

  # The library uses endless ranges (`value[pos..]`), which need Ruby 2.6.
  spec.required_ruby_version = ">= 2.6"

  unless spec.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against " \
          "public gem pushes."
  end
  spec.metadata["allowed_push_host"] = "https://rubygems.org"

  # Package every tracked file except the test suites.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # ">= 1.14" keeps the original floor but no longer rejects Bundler 2.
  spec.add_development_dependency "bundler", ">= 1.14"
  spec.add_development_dependency "codeclimate-test-reporter", "~> 1.0.0"
  # Rake releases before 12.3.3 are affected by CVE-2020-8130.
  spec.add_development_dependency "rake", ">= 12.3.3"
  spec.add_development_dependency "rspec", "~> 3.0"
  spec.add_development_dependency "rubocop"
  spec.add_development_dependency "simplecov"

  spec.add_dependency "connection_pool"
  spec.add_dependency "deepl-rb"
  spec.add_dependency "dry-initializer"
  spec.add_dependency "ox"
  spec.add_dependency "punkt-segmenter"
  spec.add_dependency "ratelimit"
  spec.add_dependency "redis"
  spec.add_dependency "redis-namespace"
end
|
55
|
+
# rubocop:enable Metrics/BlockLength
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "digest" # Digest::MD5 is used for cache keys below

# Looks sentences up in, and writes fresh translations to, the store configured
# as DeepLDiff.cache_store. Entries are keyed by language pair plus an MD5
# digest of the stripped source sentence.
class DeepLDiff::Cache
  extend Dry::Initializer

  param :from
  param :to

  # Splits +values+ into what the cache store already has and what it lacks.
  #
  # @param values [Array<String>] source sentences
  # @return [Array] two elements: the store's +read_multi+ reply (indexed like
  #   +values+, nil marking a miss) and the missed values in their original order
  def cached_and_missing(values)
    cached = cache_store.read_multi(values.map { |value| key(value) })
    missing = values.select.with_index { |_value, index| cached[index].nil? }

    [cached, missing]
  end

  # Fills the gaps in +cached+ with +updates+ (consumed in order) and writes
  # each newly used translation to the cache store.
  #
  # @param values [Array<String>] source sentences, indexed like +cached+
  # @param cached [Array<String, nil>] first element of #cached_and_missing
  # @param updates [Array<String>] translations for the missed values, in order
  # @return [Array] +cached+ with every nil replaced by the next update
  def store(values, cached, updates)
    pending = updates.dup # consume a copy so the caller's array stays intact

    cached.map.with_index do |value, index|
      value || store_value(values[index], pending.shift)
    end
  end

  private

  # Writes one translation to the cache store and returns it.
  def store_value(value, translation)
    cache_store.write(key(value), translation)
    translation
  end

  # Cache key "<from>:<to>:<md5>". The value is stripped first, so surrounding
  # whitespace does not change the key.
  def key(value)
    hash = Digest::MD5.hexdigest(value.strip)
    "#{from}:#{to}:#{hash}"
  end

  def cache_store
    DeepLDiff.cache_store
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "cgi" # CGI.escape is used to measure values

# Groups a flat list of strings into chunks bounded by a total size (+limit+)
# and by a number of values (+count_limit+).
class DeepLDiff::Chunker
  extend ::Dry::Initializer

  # Raised when a single value is longer than +limit+.
  class Error < StandardError; end

  # values: strings collected so far; size: their accumulated escaped size.
  Chunk = Struct.new(:values, :size)

  MAX_CHUNK_SIZE = 1700
  COUNT_LIMIT = 300

  param :values
  option :limit, default: proc { MAX_CHUNK_SIZE }
  option :count_limit, default: proc { COUNT_LIMIT }

  # @return [Array<Array<String>>] the values of every chunk, in input order
  # @raise [Error] if a value is longer than +limit+
  def call
    chunks.map(&:values)
  end

  # @return [Array<Chunk>] chunks built by appending values in order and
  #   opening a new chunk whenever the current one cannot take the next value
  # @raise [Error] if a value is longer than +limit+
  def chunks
    values.each_with_object([]) do |value, chunks|
      validate_value_size(value)

      chunks << Chunk.new([], 0) if next_chunk?(chunks.last, value)
      update_chunk(chunks.last, value)
    end
  end

  private

  # True when there is no chunk yet, when adding +value+ would push the chunk
  # past +limit+, or when the chunk already holds +count_limit+ values.
  # (`>=`: the former `>` let a chunk grow to count_limit + 1 values.)
  def next_chunk?(tail, value)
    tail.nil? ||
      (size(value) + tail.size > limit) ||
      tail.values.size >= count_limit
  end

  # Size of +text+ once URL-escaped.
  def size(text)
    CGI.escape(text).size
  end

  # Appends +value+ and accounts for it in the same escaped unit that
  # #next_chunk? compares against (previously the raw length was added,
  # mixing two units in one running total).
  def update_chunk(chunk, value)
    chunk.values << value
    chunk.size += size(value)
  end

  # Deliberately checks the raw length, as before: switching to the escaped
  # size here would start rejecting values that are accepted today.
  def validate_value_size(value)
    raise Error, "Too long part #{value.size} > #{limit}" if value.size > limit
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Flattens nested Hash/Array structures into a flat list of leaves and
# rebuilds a structure of the same shape from a flat list.
class DeepLDiff::Linearizer
  class << self
    # Collects every leaf of +struct+ depth-first, in iteration order.
    #
    # @param struct [Hash, Array, Object] anything; non-Hash/Array is a leaf
    # @param array [Array] accumulator the leaves are appended to
    # @return [Array] +array+ with the leaves appended
    def linearize(struct, array = [])
      case struct
      when Hash
        struct.each_value { |value| linearize(value, array) }
      when Array
        struct.each { |value| linearize(value, array) }
      else
        array << struct
      end

      array
    end

    # Rebuilds a structure shaped like +struct+ whose leaves are taken from
    # +array+ in order. +array+ itself is left untouched (it used to be
    # emptied with #shift, which also emptied memoized arrays passed in).
    # Leaves beyond the end of +array+ become nil.
    #
    # @param struct [Hash, Array, Object] template structure
    # @param array [Array] replacement leaves, in #linearize order
    # @return [Hash, Array, Object] copy of +struct+ with replaced leaves
    def restore(struct, array)
      rebuild(struct, array.dup)
    end

    private

    # Recursive worker for #restore; consumes +queue+ destructively.
    def rebuild(struct, queue)
      case struct
      when Hash
        struct.transform_values { |value| rebuild(value, queue) }
      when Array
        struct.map { |value| rebuild(value, queue) }
      else
        queue.shift
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Cache store backed by Redis through a connection pool. Every command runs
# against a Redis::Namespace wrapped around a pooled connection.
class DeepLDiff::RedisCacheStore
  extend Dry::Initializer

  param :connection_pool

  # Entry lifetime in seconds (defaults to seven days).
  option :timeout, default: proc { 60 * 60 * 24 * 7 }
  option :namespace, default: proc { DeepLDiff::CACHE_NAMESPACE }

  # Reads several keys at once.
  #
  # @param keys [Array<String>]
  # @return [Array] the MGET reply for +keys+; [] when +keys+ is empty
  def read_multi(keys)
    return [] if keys.empty? # MGET without any key is a Redis error

    redis { |redis| redis.mget(*keys) }
  end

  # Stores +value+ under +key+ with the configured expiry.
  def write(key, value)
    # to_i so that duration-like objects (e.g. `7.days`) become plain seconds.
    redis { |redis| redis.setex(key, timeout.to_i, value) }
  end

  private

  # Checks a connection out of the pool and yields it wrapped in the namespace.
  def redis
    connection_pool.with do |redis|
      yield Redis::Namespace.new(namespace, redis: redis)
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Rate limiter built on the `ratelimit` gem and a Redis connection pool.
#
# NOTE(review): +threshold+ and +interval+ are declared with `param`, i.e. as
# positional arguments. Passing them as keywords presumably leaves the
# defaults in place - confirm against dry-initializer's handling of unknown
# options.
class DeepLDiff::RedisRateLimiter
  extend Dry::Initializer

  # Raised by #check when the limit is already exceeded.
  class RateLimitExceeded < StandardError; end

  # Subject under which usage is both recorded and checked.
  SUBJECT = "call"

  param :connection_pool
  param :threshold, default: proc { 8000 }
  param :interval, default: proc { 60 }

  option :namespace, default: proc { DeepLDiff::CACHE_NAMESPACE }

  # Raises if the limit is already exceeded, otherwise records +size+ units.
  #
  # @param size [Integer] amount of usage to record for this call
  # @raise [RateLimitExceeded] if SUBJECT already exceeds +threshold+ within
  #   +interval+
  def check(size)
    connection_pool.with do |redis|
      rate_limit = Ratelimit.new(namespace, redis: redis)
      raise RateLimitExceeded if rate_limit.exceeded?(SUBJECT, threshold: threshold, interval: interval)

      # Ratelimit#add takes (subject, count). The former `add size` recorded a
      # single hit under the subject `size`, so SUBJECT never accumulated.
      rate_limit.add(SUBJECT, size)
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "forwardable"

# Translates one input (string, array or nested hash) by tokenizing each text,
# loading known sentences from the cache, sending only the missing ones to the
# API, and reassembling the result in the shape of the input.
class DeepLDiff::Request
  extend Dry::Initializer
  extend Forwardable

  # Number of leading characters sent to the API when detecting the language.
  DETECTION_SAMPLE_SIZE = 100

  param :values
  param :options

  def_delegators :DeepLDiff, :api, :cache_store, :rate_limiter
  def_delegators :"DeepLDiff::Linearizer", :linearize, :restore

  # @return [Object] +values+ translated, in the same shape; +values+ itself
  #   when it is nil/empty or when source and target languages are equal
  # @raise [RuntimeError] if DeepLDiff.api or DeepLDiff.cache_store is unset
  def call
    validate_globals

    # Emptiness is checked first so that empty input never triggers the
    # language-detection API call hidden behind #from.
    return values if values.nil? || values.empty? || from == to

    translation
  end

  private

  # Private copy of the caller's options. :from and :to are removed from the
  # copy, so the caller's hash can be reused for further requests; whatever
  # remains is forwarded to the API.
  def api_options
    @api_options ||= options.dup
  end

  # Source language: the :from option, or a language detected through the API.
  def from
    @from ||= api_options.delete(:from) || detect_language
  end

  # Target language: the :to option (nil when absent).
  def to
    @to ||= api_options.delete(:to)
  end

  # Sends a short sample of the text to the API and returns the detected
  # source language, downcased.
  def detect_language
    sample = text_tokens_texts.join(" ")[0, DETECTION_SAMPLE_SIZE]
    api.translate(sample, nil, to).detected_source_language.downcase
  end

  def validate_globals
    raise "Set DeepLDiff.api before calling ::translate" unless api
    return if cache_store

    raise "Set DeepLDiff.cache_store before calling ::translate"
  end

  # Flat array of the input texts.
  # { name: "Name", bio: "<b>Good</b> boy" } => ["Name", "<b>Good</b> boy"]
  def texts
    @texts ||= linearize(values)
  end

  # Token list for every text.
  # => [..., [["<b>", :markup], ["Good", :text], ...]]
  def tokens
    @tokens ||= texts.map { |value| DeepLDiff::Tokenizer.tokenize(value) }
  end

  # Text tokens keyed by "<group index>_<token index>".
  # => { ..., "1_1" => "Good", "1_3" => "Boy", ... }
  def text_tokens
    @text_tokens ||= extract_text_tokens.to_h
  end

  # [key, value] pairs for every :text token, in order.
  def extract_text_tokens
    tokens.each_with_index.with_object([]) do |(group, group_index), result|
      group.each_with_index do |(value, type), index|
        result << ["#{group_index}_#{index}", value] if type == :text
      end
    end
  end

  # Stripped values of the text tokens.
  # => [ ..., "Good", "Boy", ... ]
  def text_tokens_texts
    @text_tokens_texts ||= text_tokens.values.map { |value| value.to_s.strip }
  end

  # Text values grouped into per-request chunks (see DeepLDiff::Chunker).
  # => [[ ..., "Good", "Boy", ... ]]
  def chunks
    @chunks ||= DeepLDiff::Chunker.new(text_tokens_texts).call
  end

  # Every chunk translated: taken from the cache where possible, otherwise
  # requested from the API and stored.
  # => [[ ..., "Horoshiy", "Malchik", ... ]]
  def chunks_translated
    @chunks_translated ||= chunks.map do |chunk|
      cached, missing = cache.cached_and_missing(chunk)
      next cached if missing.empty?

      cache.store(chunk, cached, call_api(missing))
    end
  end

  # Translations keyed like #text_tokens.
  # => { ..., "1_1" => "Horoshiy", "1_3" => "Malchik", ... }
  def text_tokens_translated
    @text_tokens_translated ||=
      restore(text_tokens, chunks_translated.flatten)
  end

  # Token lists with every text token replaced by its translation, padded
  # with the same surrounding whitespace as the source token.
  # => [[..., [ "Horoshiy", :text ], ...]]
  def tokens_translated
    # Copy the groups and the [value, type] pairs: a shallow `tokens.dup`
    # would let the assignments below overwrite the memoized source tokens.
    @tokens_translated ||= tokens.map { |group| group.map(&:dup) }.tap do |copy|
      text_tokens_translated.each do |position, value|
        group_index, index = position.split("_").map(&:to_i)
        pair = copy[group_index][index]
        pair[0] = restore_spacing(pair[0], value)
      end
    end
  end

  def restore_spacing(source_value, value)
    DeepLDiff::Spacing.restore(source_value, value)
  end

  # Texts reassembled from the translated tokens.
  # => [..., "<b>Horoshiy</b> Malchik", ...]
  def texts_translated
    @texts_translated ||= tokens_translated.map do |group|
      group.map { |value, type| type == :text ? value : fix_ascii(value) }.join
    end
  end

  # Final result: translated texts arranged in the shape of +values+.
  def translation
    # A copy is handed over so the memoized array survives the call even if
    # the linearizer consumes its argument.
    @translation ||= restore(values, texts_translated.dup)
  end

  # Translates +values+ through the API after the rate-limit check.
  #
  # @return [Array<String>] the translated texts
  def call_api(values)
    check_rate_limit(values)

    # Resolve both languages first: doing so removes :from/:to from
    # api_options before the remainder is splatted into the call.
    source = from
    target = to
    [api.translate(values, source, target, **api_options)].flatten.map(&:text)
  end

  def cache
    @cache ||= DeepLDiff::Cache.new(from, to)
  end

  # Reports the total length of +values+ to the rate limiter, if one is set.
  def check_rate_limit(values)
    return if rate_limiter.nil?

    rate_limiter.check(values.sum(&:size))
  end

  # Markup should not contain control characters
  def fix_ascii(value)
    value.gsub(/[\u0000-\u001F]/, " ")
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Gives a string the same leading and trailing whitespace as another string.
class DeepLDiff::Spacing
  # Leading whitespace run; matches only if a non-space character follows,
  # so an all-whitespace string has no "leading" part.
  LEADING = /\A[[:space:]]+(?=[^[:space:]])/.freeze
  # Trailing whitespace run.
  TRAILING = /[[:space:]]+\z/.freeze
  # Whitespace or NUL at either end. Uses the same [[:space:]] class as the
  # two patterns above, so characters such as U+00A0 count on both sides
  # (String#strip would leave them in place).
  EDGES = /\A[[:space:]\x00]+|[[:space:]\x00]+\z/.freeze

  class << self
    # DeepLDiff::Spacing.restore(" a ", "Z") # => " Z "
    #
    # @param left [String] string whose outer whitespace is copied
    # @param right [String] string that is trimmed and then padded
    # @return [String] +right+ without its own outer whitespace, wrapped in
    #   the leading and trailing whitespace of +left+
    def restore(left, right)
      "#{leading(left)}#{right.gsub(EDGES, '')}#{trailing(left)}"
    end

    private

    # Whitespace before the first non-space character ("" if there is none).
    def leading(value)
      value[LEADING] || ""
    end

    # Whitespace at the very end of +value+ ("" if there is none).
    def trailing(value)
      value[TRAILING] || ""
    end
  end
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "stringio"

# SAX handler that cuts an HTML string into [value, type] tokens, where type
# is :markup or :text, and splits text tokens into sentences.
#
# While parsing it records one entry per SAX event in two parallel arrays:
# @sequence (the kind of the piece starting at that event) and @indices (its
# byte offset in the source). Tokens are later sliced out of the source
# between consecutive offsets.
class DeepLDiff::Tokenizer < ::Ox::Sax
  # Elements whose text content is treated as markup.
  SKIP = %i[script style].freeze
  # Sequence kinds that end up as :text tokens.
  INNER_SPANS = %i[notranslate span end_span end_notranslate].freeze
  HTML_OPTIONS = { smart: true, skip: :skip_none }.freeze
  # Sequence kinds that open or close a notranslate region.
  NOTRANSLATE_MARKERS = %i[notranslate end_notranslate].freeze
  # Nesting contribution of span open/close kinds inside a notranslate region.
  SPAN_DEPTH = { span: 1, end_span: -1 }.freeze

  # @param source [String] the HTML being parsed; tokens are sliced from it
  def initialize(source)
    # NOTE(review): presumably Ox updates @pos during parsing because the
    # handler defines it - confirm against the Ox::Sax documentation.
    @pos = nil
    @source = source
    @tokens = nil
    @context = []
    @sequence = []
    @indices = []
  end

  def instruct(target)
    start_markup(target)
  end

  def end_instruct(target)
    end_markup(target)
  end

  def start_element(name)
    start_markup(name)
  end

  def end_element(name)
    end_markup(name)
  end

  # Marks the span that was just opened as the start of a notranslate region
  # when it carries class="notranslate" and no such region is open already.
  def attr(name, value)
    return unless @context.last == :span
    return unless name == :class && value == "notranslate"
    return if notranslate?

    @sequence[-1] = :notranslate
  end

  # Records a text piece; text directly inside SKIP elements counts as markup.
  def text(value)
    return if value == ""

    @sequence << (SKIP.include?(@context.last) ? :markup : :text)
    @indices << @pos - 1
  end

  # @return [Array<Array>] [value, type] pairs with adjacent pieces of the
  #   same type merged and text split into sentences
  def tokens
    @tokens ||= token_sequences_joined
                .tap { |tokens| make_sentences_from_last_token(tokens) }
  end

  private

  # Merges consecutive raw tokens of the same type; whenever the type
  # changes, the token just completed is split into sentences if it is text.
  def token_sequences_joined
    raw_tokens.each_with_object([]) do |token, tokens|
      if tokens.empty? # Initial state
        tokens << token
      elsif tokens.last[1] == token[1]
        # Join series of tokens of the same type into one
        tokens.last[0].concat(token[0])
      else
        # If token before :markup is :text we need to split it into sentences
        make_sentences_from_last_token(tokens)
        tokens << token
      end
    end
  end

  # Replaces a trailing :text token with its sentences.
  def make_sentences_from_last_token(tokens)
    return if tokens.empty?

    tokens.concat(sentences(tokens.pop[0])) if tokens.last[1] == :text
  end

  # Splits +value+ into sentence tokens. Each sentence runs up to the start
  # of the next one, so the whitespace between two sentences stays attached
  # to the first.
  #
  # @return [Array<Array>] [sentence, :text] pairs
  def sentences(value)
    return [] if value.empty?
    # Whitespace-only text is kept as an untranslated token. It used to be
    # dropped, which removed the space between adjacent inline elements.
    return [[value, :markup]] if value.strip.empty?

    boundaries =
      Punkt::SentenceTokenizer
      .new(value)
      .sentences_from_text(value)

    # `<=` rather than `==`: never lose the text if no boundary is reported.
    return [[value, :text]] if boundaries.size <= 1

    boundaries.map.with_index do |(left, right), index|
      next_boundary = boundaries[index + 1]
      right = next_boundary[0] - 1 if next_boundary

      [value[left..right], :text]
    end
  end

  # Whether the sequence is between `:notranslate` and `:end_notranslate`
  def notranslate?
    marker = @sequence.reverse_each.find { |item| NOTRANSLATE_MARKERS.include?(item) }
    marker == :notranslate
  end

  # Kind to record for a closing span: :markup outside a notranslate region,
  # :end_span while spans opened inside the region are still open, and
  # :end_notranslate when this span closes the region itself.
  def end_span
    return :markup unless notranslate?

    opened_spans = @sequence
                   .reverse_each
                   .take_while { |item| item != :notranslate }
                   .sum { |item| SPAN_DEPTH.fetch(item, 0) }

    opened_spans.positive? ? :end_span : :end_notranslate
  end

  # Slices the source between consecutive recorded offsets; the last piece
  # runs to the end of the source.
  def raw_tokens
    @raw_tokens ||= @indices.each_with_index.map do |first, n|
      last = (@indices[n + 1] || 0) - 1
      value = fix_utf(@source.byteslice(first..last))
      type = @sequence[n]
      type = :text if INNER_SPANS.include?(type)
      [value, type]
    end
  end

  # Returns +value+ as valid UTF-8, with undefined or invalid bytes replaced
  # by a space.
  def fix_utf(value)
    # String#encode leaves the string unchanged when it is already tagged
    # UTF-8, invalid bytes included, so scrub handles that case.
    value
      .encode("UTF-8", undef: :replace, invalid: :replace, replace: " ")
      .scrub(" ")
  end

  # Records an opening tag or instruction. Inside a notranslate region spans
  # are tracked as :span and everything else counts as :text.
  def start_markup(name)
    @context << name
    @sequence << (if notranslate?
                    name == :span ? :span : :text
                  else
                    :markup
                  end)
    @indices << @pos - 1
  end

  # Records the piece that follows a closing tag. No offset is recorded when
  # the parser is already at the end of the source.
  def end_markup(name)
    @context.pop
    @sequence << (if notranslate?
                    name == :span ? end_span : :text
                  else
                    :markup
                  end)
    @indices << @pos - 1 unless @pos == @source.bytesize
  end

  class << self
    # Parses +value+ as HTML and returns its tokens.
    #
    # @param value [String, nil]
    # @return [Array<Array>] [value, type] pairs; [] for nil
    def tokenize(value)
      return [] if value.nil?

      handler = new(value)
      Ox.sax_parse(handler, StringIO.new(value), HTML_OPTIONS)
      handler.tokens
    end
  end
end
|
data/lib/deepl_diff.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "ox"
|
4
|
+
require "punkt-segmenter"
|
5
|
+
require "dry/initializer"
|
6
|
+
require "deepl"
|
7
|
+
|
8
|
+
require "deepl_diff/version"
|
9
|
+
require "deepl_diff/tokenizer"
|
10
|
+
require "deepl_diff/linearizer"
|
11
|
+
require "deepl_diff/chunker"
|
12
|
+
require "deepl_diff/spacing"
|
13
|
+
require "deepl_diff/cache"
|
14
|
+
require "deepl_diff/redis_cache_store"
|
15
|
+
require "deepl_diff/redis_rate_limiter"
|
16
|
+
require "deepl_diff/request"
|
17
|
+
|
18
|
+
# Entry point of the gem. Holds the global collaborators (API client, cache
# store, optional rate limiter) and exposes ::translate.
module DeepLDiff
  # Default namespace used by the Redis-backed cache store and rate limiter.
  CACHE_NAMESPACE = "deepl-diff"

  # Module-level accessors: DeepLDiff.api, .cache_store, .rate_limiter
  singleton_class.attr_accessor :api, :cache_store, :rate_limiter

  # Builds a Request from the given arguments and runs it.
  #
  # @param args [Array] arguments forwarded to Request.new
  # @return [Object] the result of Request#call
  def self.translate(*args)
    Request.new(*args).call
  end
end
|
metadata
ADDED
@@ -0,0 +1,259 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: deepl_diff
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Islam Gagiev
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-02-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.14'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.14'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: codeclimate-test-reporter
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.0.0
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.0.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: simplecov
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: connection_pool
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: deepl-rb
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: dry-initializer
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: ox
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: punkt-segmenter
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
167
|
+
- !ruby/object:Gem::Dependency
|
168
|
+
name: ratelimit
|
169
|
+
requirement: !ruby/object:Gem::Requirement
|
170
|
+
requirements:
|
171
|
+
- - ">="
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
174
|
+
type: :runtime
|
175
|
+
prerelease: false
|
176
|
+
version_requirements: !ruby/object:Gem::Requirement
|
177
|
+
requirements:
|
178
|
+
- - ">="
|
179
|
+
- !ruby/object:Gem::Version
|
180
|
+
version: '0'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: redis
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
type: :runtime
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - ">="
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: redis-namespace
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
description: "\nDeepL API wrapper for Ruby which helps to translate only changes\nbetween
|
210
|
+
revisions of long texts.\n "
|
211
|
+
email:
|
212
|
+
- omniacinis@gmail.com
|
213
|
+
executables: []
|
214
|
+
extensions: []
|
215
|
+
extra_rdoc_files: []
|
216
|
+
files:
|
217
|
+
- ".gitignore"
|
218
|
+
- ".rspec"
|
219
|
+
- ".rubocop.yml"
|
220
|
+
- ".travis.yml"
|
221
|
+
- Gemfile
|
222
|
+
- README.md
|
223
|
+
- Rakefile
|
224
|
+
- deepl_diff.gemspec
|
225
|
+
- lib/deepl_diff.rb
|
226
|
+
- lib/deepl_diff/cache.rb
|
227
|
+
- lib/deepl_diff/chunker.rb
|
228
|
+
- lib/deepl_diff/linearizer.rb
|
229
|
+
- lib/deepl_diff/redis_cache_store.rb
|
230
|
+
- lib/deepl_diff/redis_rate_limiter.rb
|
231
|
+
- lib/deepl_diff/request.rb
|
232
|
+
- lib/deepl_diff/spacing.rb
|
233
|
+
- lib/deepl_diff/tokenizer.rb
|
234
|
+
- lib/deepl_diff/version.rb
|
235
|
+
homepage: https://github.com/Halvanhelv/deepl_diff
|
236
|
+
licenses: []
|
237
|
+
metadata:
|
238
|
+
allowed_push_host: https://rubygems.org
|
239
|
+
post_install_message:
|
240
|
+
rdoc_options: []
|
241
|
+
require_paths:
|
242
|
+
- lib
|
243
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
244
|
+
requirements:
|
245
|
+
- - ">="
|
246
|
+
- !ruby/object:Gem::Version
|
247
|
+
version: '0'
|
248
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
249
|
+
requirements:
|
250
|
+
- - ">="
|
251
|
+
- !ruby/object:Gem::Version
|
252
|
+
version: '0'
|
253
|
+
requirements: []
|
254
|
+
rubygems_version: 3.1.6
|
255
|
+
signing_key:
|
256
|
+
specification_version: 4
|
257
|
+
summary: DeepL API wrapper for Ruby which helps to translate only changes between
|
258
|
+
revisions of long texts.
|
259
|
+
test_files: []
|