abroad 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +142 -0
- data/abroad.gemspec +25 -0
- data/lib/abroad/extractors/extractor.rb +46 -0
- data/lib/abroad/extractors/json/json_extractor.rb +17 -0
- data/lib/abroad/extractors/json/key_value_extractor.rb +38 -0
- data/lib/abroad/extractors/json.rb +8 -0
- data/lib/abroad/extractors/xml/android_extractor.rb +113 -0
- data/lib/abroad/extractors/xml/xml_extractor.rb +28 -0
- data/lib/abroad/extractors/xml.rb +8 -0
- data/lib/abroad/extractors/yaml/dotted_key_extractor.rb +40 -0
- data/lib/abroad/extractors/yaml/jruby_compat.rb +170 -0
- data/lib/abroad/extractors/yaml/rails_extractor.rb +15 -0
- data/lib/abroad/extractors/yaml/yaml_extractor.rb +28 -0
- data/lib/abroad/extractors/yaml.rb +9 -0
- data/lib/abroad/extractors.rb +28 -0
- data/lib/abroad/serializers/json/json_serializer.rb +27 -0
- data/lib/abroad/serializers/json/key_value_serializer.rb +20 -0
- data/lib/abroad/serializers/json.rb +8 -0
- data/lib/abroad/serializers/serializer.rb +52 -0
- data/lib/abroad/serializers/trie.rb +76 -0
- data/lib/abroad/serializers/xml/android_serializer.rb +143 -0
- data/lib/abroad/serializers/xml/xml_serializer.rb +23 -0
- data/lib/abroad/serializers/xml.rb +8 -0
- data/lib/abroad/serializers/yaml/rails_serializer.rb +110 -0
- data/lib/abroad/serializers/yaml/yaml_serializer.rb +19 -0
- data/lib/abroad/serializers/yaml.rb +8 -0
- data/lib/abroad/serializers.rb +29 -0
- data/lib/abroad/version.rb +3 -0
- data/lib/abroad.rb +37 -0
- data/lib/ext/htmlentities/android_xml_decoder.rb +15 -0
- data/lib/ext/htmlentities/android_xml_encoder.rb +23 -0
- data/spec/abroad_spec.rb +35 -0
- data/spec/extractors/json/fixtures/arrays.json +9 -0
- data/spec/extractors/json/fixtures/basic.json +5 -0
- data/spec/extractors/json/fixtures/objects.json +11 -0
- data/spec/extractors/json/fixtures.yml +22 -0
- data/spec/extractors/json/json_extractor_spec.rb +6 -0
- data/spec/extractors/xml/fixtures/basic_arrays.xml +9 -0
- data/spec/extractors/xml/fixtures/basic_plurals.xml +8 -0
- data/spec/extractors/xml/fixtures/basic_strings.xml +8 -0
- data/spec/extractors/xml/fixtures/entities.xml +4 -0
- data/spec/extractors/xml/fixtures/markup.xml +5 -0
- data/spec/extractors/xml/fixtures/newlines.xml +6 -0
- data/spec/extractors/xml/fixtures/quotes.xml +9 -0
- data/spec/extractors/xml/fixtures.yml +54 -0
- data/spec/extractors/xml/xml_extractor_spec.rb +6 -0
- data/spec/extractors/yaml/fixtures/arrays.yml +9 -0
- data/spec/extractors/yaml/fixtures/arrays_and_hashes.yml +29 -0
- data/spec/extractors/yaml/fixtures/invalid_single_quote_escape.yml +2 -0
- data/spec/extractors/yaml/fixtures/invalid_single_quote_escape_array.yml +3 -0
- data/spec/extractors/yaml/fixtures/nesting.yml +19 -0
- data/spec/extractors/yaml/fixtures/short.yml +2 -0
- data/spec/extractors/yaml/fixtures.yml +75 -0
- data/spec/extractors/yaml/jruby_compat_spec.rb +54 -0
- data/spec/extractors/yaml/yaml_extractor_spec.rb +6 -0
- data/spec/serializers/json/key_value_serializer_spec.rb +26 -0
- data/spec/serializers/xml/android_serializer_spec.rb +165 -0
- data/spec/serializers/yaml/rails_serializer_spec.rb +171 -0
- data/spec/spec_helper.rb +43 -0
- metadata +186 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 83e4de2f30d6fd16a710df079d0bc64f3f76da4c
|
4
|
+
data.tar.gz: 9afb235066836270d5eace94db8d5a815ed2c538
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: aa9ccf3bee54d01500b3ecf996d8dcaeed5ae6fb0a27de1adc7d1a31c63797ccd1083432ee5d03132358aba65f7a721282a25947beda6bdf063c17b47bcedd0d
|
7
|
+
data.tar.gz: 5db0947ad41a62c6d3efeeb5ed11e7705fee237eeeebd6e524b7d77418a95b87d5788fac61479f36786f952e2ba85f22fde845c4eff870284d5e91670a237230
|
data/README.md
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/camertron/abroad.svg?branch=master)](https://travis-ci.org/camertron/abroad)
|
2
|
+
|
3
|
+
abroad
|
4
|
+
====================
|
5
|
+
|
6
|
+
A set of tools for serializing and extracting strings to and from a number of localization file formats. Currently supported formats are:
|
7
|
+
|
8
|
+
1. YAML, both plain and Rails-style
|
9
|
+
2. Android XML
|
10
|
+
3. JSON key/value
|
11
|
+
|
12
|
+
Adding additional extractors and serializers is straightforward; skip to the bottom of this document to learn more.
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
`gem install abroad`, or add it to your Gemfile.
|
17
|
+
|
18
|
+
Then, somewhere in your project:
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
require 'abroad'
|
22
|
+
```
|
23
|
+
|
24
|
+
## Introduction
|
25
|
+
|
26
|
+
Most application frameworks specify a way to localize (i.e. translate) UI phrases and other content. Usually this is done via flat, static files that map strings written in a source language to translations written in any number of target languages. In Ruby on Rails, this is done via YAML files stored in the config/locales directory. Each file contains a series of nested key/value pairs, where the key is a machine-readable, globally unique identifier and the value is a human-readable bit of text meant to be displayed to users of the application. English strings go in config/locales/en.yml, Spanish strings go in config/locales/es.yml and so on. Both en.yml and es.yml contain the same set of keys, but different (translated) values for those keys.
|
27
|
+
|
28
|
+
Localization file formats are usually based on some standard format like YAML, but often extended in unique ways specific to the framework or platform. Interpreting these files can be difficult because of the various edge cases and platform-specific expectations. If you ever find yourself needing to parse or write out compatible files, consider using well-tested tools like the ones in this project.
|
29
|
+
|
30
|
+
## Usage
|
31
|
+
|
32
|
+
Abroad provides extractors for reading keys and values from localization files, and serializers for writing them out. The usage for each is slightly different.
|
33
|
+
|
34
|
+
### Extractors
|
35
|
+
|
36
|
+
Let's say you're working with this Rails YAML file:
|
37
|
+
|
38
|
+
```yaml
|
39
|
+
en:
|
40
|
+
  welcome:
|
41
|
+
    message: hello
|
42
|
+
  goodbye:
|
43
|
+
    message: goodbye
|
44
|
+
```
|
45
|
+
|
46
|
+
To extract strings from this file, try something like this:
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
Abroad.extractor('yaml/rails').open('/path/to/en.yml') do |extractor|
|
50
|
+
extractor.extract_each do |key, string|
|
51
|
+
# on first iteration, key == 'welcome.message', string == 'hello'
|
52
|
+
# on second iteration, key == 'goodbye.message', string == 'goodbye'
|
53
|
+
end
|
54
|
+
end
|
55
|
+
```
|
56
|
+
|
57
|
+
The `Abroad.extractor` method returns a registered extractor class, or `nil` if the extractor can't be found. Extractor classes respond to `open`, `from_stream`, and `from_string`, and can be called with or without a block. If passed a block, the file or stream will be automatically closed when the block terminates. If you choose to not pass a block, you're responsible for calling `close` yourself.
|
58
|
+
|
59
|
+
Here's an example with all the steps broken down:
|
60
|
+
|
61
|
+
```ruby
|
62
|
+
extractor_klass = Abroad.extractor('yaml/rails')
|
63
|
+
extractor = extractor_klass.open('/path/to/en.yml')
|
64
|
+
extractor.extract_each do |key, string|
|
65
|
+
...
|
66
|
+
end
|
67
|
+
extractor.close
|
68
|
+
```
|
69
|
+
|
70
|
+
When called without a block, the `extract_each` method on extractor instances returns an enumerator, which means you have access to all the wonderful `Enumerable` methods like `map`, `inject`, etc.:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
Abroad.extractor('yaml/rails').open('/path/to/en.yml') do |extractor|
|
74
|
+
extractor.extract_each.with_object({}) do |(key, string), result|
|
75
|
+
result[key] = string
|
76
|
+
end
|
77
|
+
end
|
78
|
+
```
|
79
|
+
|
80
|
+
To get a list of all available extractors, use the `extractors` method:
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
Abroad.extractors # => ["yaml/rails", "xml/android", ...]
|
84
|
+
```
|
85
|
+
|
86
|
+
### Serializers
|
87
|
+
|
88
|
+
While extractors pull strings out of localization files, serializers write them back in. Serializers conform to a similar interface, but offer different methods to write content out to the stream:
|
89
|
+
|
90
|
+
```ruby
|
91
|
+
Abroad.serializer('yaml/rails').open('/path/to/es.yml') do |serializer|
|
92
|
+
serializer.write_key_value('welcome.message', 'hola')
|
93
|
+
serializer.write_key_value('goodbye.message', 'adios')
|
94
|
+
end
|
95
|
+
```
|
96
|
+
|
97
|
+
In addition to `write_key_value`, serializer instances respond to the `write_raw` method, which is capable of writing raw text to the underlying stream. You might use this method if you needed to write a comment to the file or maybe a preamble at the beginning.
|
98
|
+
|
99
|
+
Serializer classes respond to `from_stream` in addition to `open`. Both methods can be called with or without a block. If passed a block, the file or stream will be automatically closed when the block terminates. If you choose to not pass a block, you're responsible for calling `close` yourself.
|
100
|
+
|
101
|
+
Here's an example with all the steps broken down:
|
102
|
+
|
103
|
+
```ruby
|
104
|
+
serializer_klass = Abroad.serializer('yaml/rails')
|
105
|
+
serializer = serializer_klass.open('/path/to/es.yml')
|
106
|
+
serializer.write_key_value('welcome.message', 'hola')
|
107
|
+
serializer.write_key_value('goodbye.message', 'adios')
|
108
|
+
serializer.close
|
109
|
+
```
|
110
|
+
|
111
|
+
To get a list of all available serializers, use the `serializers` method:
|
112
|
+
|
113
|
+
```ruby
|
114
|
+
Abroad.serializers # => ["yaml/rails", "xml/android", ...]
|
115
|
+
```
|
116
|
+
|
117
|
+
### Writing Your Own
|
118
|
+
|
119
|
+
Conformant _extractors_ should inherit from `Abroad::Extractors::Extractor` and need to define the method `extract_each`. See lib/abroad/extractors/extractor.rb for a quick look at the interface. Methods that raise `NotImplementedError`s are ones you need to define in your subclass.
|
120
|
+
|
121
|
+
Conformant _serializers_ should inherit from `Abroad::Serializers::Serializer` and need to define the `write_key_value` and `write_raw` methods. See lib/abroad/serializers/serializer.rb for a quick look at the interface. Methods that raise `NotImplementedError`s are ones you need to define in your subclass.
|
122
|
+
|
123
|
+
Once you've finished writing your extractor or serializer, register it with Abroad:
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
Abroad::Extractors.register('strings/ios', Strings::IosExtractor)
|
127
|
+
Abroad::Serializers.register('strings/ios', Strings::IosSerializer)
|
128
|
+
```
|
129
|
+
|
130
|
+
The first argument to the `register` method is called the serializer or extractor's _id_. The id can really be anything you want, but Abroad has established a convention of format/framework. The format is the underlying file format (e.g. json, yaml, xml, etc.), and the framework is the platform or application framework you're targeting (e.g. iOS, Android, Rails, Django, etc.). This makes it easy to avoid writing "one size fits all" classes. For example, it would be straightforward to add support for Chrome's localization file format, which is just json with a special structure. We might register an extractor with the id json/chrome instead of trying to retrofit our existing json extractor with Chrome-specific functionality.
|
131
|
+
|
132
|
+
## Requirements
|
133
|
+
|
134
|
+
This project has no external requirements.
|
135
|
+
|
136
|
+
## Running Tests
|
137
|
+
|
138
|
+
`bundle exec rake` or `bundle exec rspec` should do the trick.
|
139
|
+
|
140
|
+
## Authors
|
141
|
+
|
142
|
+
* Cameron C. Dutro: http://github.com/camertron
|
data/abroad.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), 'lib')
|
2
|
+
require 'abroad/version'
|
3
|
+
|
4
|
+
# Packaging metadata for the abroad gem: identity, runtime dependencies and
# the list of files shipped in the built package.
Gem::Specification.new do |s|
  s.name     = 'abroad'
  s.version  = ::Abroad::VERSION
  s.authors  = ['Cameron Dutro']
  s.email    = ['camertron@gmail.com']
  s.homepage = "https://github.com/camertron/abroad"

  # The same sentence is used for both the long description and the summary.
  s.description = s.summary = 'A set of parsers and serializers for dealing with localization file formats.'

  s.platform = Gem::Platform::RUBY
  # NOTE: `s.has_rdoc = true` used to be set here. RubyGems deprecated that
  # setter with no replacement (its value is ignored), so it is no longer set.

  # Runtime dependencies.
  s.add_dependency 'htmlentities', '~> 4.0'
  s.add_dependency 'json-stream', '~> 0.0'
  s.add_dependency 'json-write-stream', '~> 1.0'
  s.add_dependency 'nokogiri', '~> 1.0'
  s.add_dependency 'xml-write-stream', '~> 1.0'
  s.add_dependency 'yaml-write-stream', '~> 1.0'

  s.require_path = 'lib'
  s.files = Dir['{lib,spec}/**/*', 'README.md', 'abroad.gemspec']
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
|
3
|
+
module Abroad
  module Extractors

    # Abstract base class for extractors. An extractor wraps an IO-like
    # stream; subclasses implement `extract_each` to pull entries out of it.
    class Extractor
      # The underlying IO-like object this extractor reads from.
      attr_reader :stream

      class << self
        # Wraps `stream` in a new extractor.
        #
        # With a block: yields the extractor, returns the block's result and
        # always closes the extractor afterwards, even if the block raises.
        # Without a block: returns the extractor; the caller must call
        # `close` when finished.
        def from_stream(stream)
          extractor = new(stream)
          return extractor unless block_given?

          begin
            yield extractor
          ensure
            # `ensure` (rather than `tap`) so the stream is not leaked when
            # the block raises.
            extractor.close
          end
        end

        # Same as `from_stream`, reading from an in-memory string.
        def from_string(string, &block)
          from_stream(StringIO.new(string), &block)
        end

        # Same as `from_stream`, reading from the file at path `file`, which
        # is opened in read mode.
        def open(file, &block)
          from_stream(File.open(file, 'r'), &block)
        end
      end

      def initialize(stream)
        @stream = stream
      end

      # Must be overridden by subclasses.
      #
      # Raises NotImplementedError when called on the base class.
      def extract_each
        raise NotImplementedError,
          'expected to be implemented in derived classes'
      end

      # Closes the underlying stream.
      def close
        stream.close
      end
    end

  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'json/stream'
|
2
|
+
|
3
|
+
module Abroad
  module Extractors
    module Json

      # Extracts key/value pairs from a JSON document using a streaming
      # parser. Only values seen while exactly one object is open and no
      # array is open are yielded, i.e. direct members of the outermost
      # object.
      class KeyValueExtractor < JsonExtractor
        private

        # Reads the whole stream, feeds it to the parser and yields
        # (key, value) for every qualifying value.
        #
        # Raises Abroad::SyntaxError if the parser reports a parse error.
        def each_entry
          object_depth = 0
          array_depth = 0
          current_key = nil

          json_parser = ::JSON::Stream::Parser.new

          # track how deeply nested the parser currently is
          json_parser.start_object { object_depth += 1 }
          json_parser.end_object { object_depth -= 1 }
          json_parser.start_array { array_depth += 1 }
          json_parser.end_array { array_depth -= 1 }

          # remember the most recent key so it can be paired with its value
          json_parser.key { |name| current_key = name }

          json_parser.value do |value|
            if block_given? && array_depth.zero? && object_depth == 1
              yield current_key, value
            end
          end

          json_parser << stream.read
        rescue ::JSON::Stream::ParserError => e
          raise Abroad::SyntaxError, "Syntax error in json: #{e.message}"
        end
      end

    end
  end
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'ext/htmlentities/android_xml_decoder'
|
3
|
+
|
4
|
+
module Abroad
  module Extractors
    module Xml

      # Extracts entries from an XML document containing <string>,
      # <string-array> and <plurals> elements. Keys come from each element's
      # `name` attribute; array items are suffixed with their index and
      # plural items with their `quantity` attribute.
      class AndroidExtractor < XmlExtractor
        private

        # Parses the document once and yields (key, text) for all strings,
        # then all array items, then all plural items.
        def each_entry(&block)
          doc = parse
          each_string_entry(doc, &block)
          each_array_entry(doc, &block)
          each_plural_entry(doc, &block)
        end

        # Yields (name, text) for every <string> element.
        def each_string_entry(doc)
          doc.xpath('//string').each do |node|
            yield name_from(node), text_from(node)
          end
        end

        # Yields ("name.index", text) for every <item> of every
        # <string-array> element.
        def each_array_entry(doc)
          doc.xpath('//string-array').each do |array|
            prefix = name_from(array)

            array.xpath('item').each_with_index do |item, idx|
              yield "#{prefix}.#{idx}", text_from(item)
            end
          end
        end

        # Yields ("name.quantity", text) for every <item> of every <plurals>
        # element. Each item is expected to carry a `quantity` attribute.
        def each_plural_entry(doc)
          doc.xpath('//plurals').each do |plurals|
            prefix = name_from(plurals)

            plurals.xpath('item').each do |item|
              quantity = item.attributes['quantity'].value
              yield "#{prefix}.#{quantity}", text_from(item)
            end
          end
        end

        # Returns the content of `node` (text plus any nested markup) as a
        # string, with enclosing quotes removed and escapes decoded.
        def text_from(node)
          # Re-serialize the children under a throwaway <root> element so
          # nested markup is kept as markup in the resulting string.
          builder = Nokogiri::XML::Builder.new do |builder|
            builder.root do
              node.children.each do |child|
                serialize(child, builder)
              end
            end
          end

          # safe to call `strip` after `to_xml` because any string that
          # needs leading or trailing whitespace preserved should be wrapped
          # in double quotes
          unescape(
            strip_enclosing_quotes(
              builder.doc.xpath('/root/node()').to_xml.strip
            )
          )
        end

        # Recursively copies `node` into `builder`: text nodes are unescaped
        # and written as text; anything else is written as an element with
        # the same name and attributes.
        def serialize(node, builder)
          if node.text?
            builder.text(unescape(node.text))
          else
            # the trailing underscore keeps the tag name from being mistaken
            # for an existing builder method
            builder.send("#{node.name}_", node.attributes) do
              node.children.each do |child|
                serialize(child, builder)
              end
            end
          end
        end

        # Returns the value of the node's `name` attribute, or nil if the
        # attribute is absent.
        def name_from(node)
          if attribute = node.attributes['name']
            attribute.value
          end
        end

        # Replaces backslash escapes (\' \" \n \r \t) with the characters
        # they stand for, then decodes entities via `coder`.
        def unescape(text)
          unescaped = text
            .gsub("\\'", "'")
            .gsub('\\"', '"')
            .gsub("\\n", "\n")
            .gsub("\\r", "\r")
            .gsub("\\t", "\t")

          coder.decode(unescaped)
        end

        # Memoized entity decoder.
        def coder
          @coder ||= HTMLEntities::AndroidXmlDecoder.new
        end

        # Removes one pair of matching single or double quotes that enclose
        # the entire text. Text that does not both start and end with the
        # same quote character is returned unchanged.
        def strip_enclosing_quotes(text)
          quote = text[0]
          return text unless quote == "'" || quote == '"'

          # /m lets `.` match newlines: by this point escaped newlines have
          # already been turned into real ones by `serialize`, and without
          # the flag a quoted multi-line value would keep its quotes.
          text.sub(/\A#{quote}(.*)#{quote}\z/m) { $1 }
        end
      end

    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Abroad
  module Extractors
    module Xml

      # Base class for XML-backed extractors. Subclasses supply a private
      # `each_entry` method that yields the extracted pairs.
      class XmlExtractor < Extractor
        # With a block, delegates to `each_entry`; without one, returns an
        # enumerator over the same entries.
        def extract_each(&block)
          return to_enum(__method__) unless block_given?

          each_entry(&block)
        end

        private

        # Parses the stream into a Nokogiri document.
        def parse
          Nokogiri::XML(stream) do |parse_config|
            # don't allow network connections
            parse_config.options = Nokogiri::XML::ParseOptions::NONET
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Abroad
  module Extractors
    module Yaml

      # Flattens a parsed YAML structure into (dotted key, value) pairs.
      # Hash keys and array indices along the way are joined with "." to
      # form each key.
      class DottedKeyExtractor < YamlExtractor
        # With a block, yields every (dotted key, leaf value) pair; without
        # one, returns an enumerator over the same pairs.
        def extract_each(&block)
          return to_enum(__method__) unless block_given?

          walk(parse, [], &block)
        end

        private

        # Depth-first traversal. `path` holds the keys/indices that lead to
        # `node`; a pair is yielded for anything that is neither a Hash nor
        # an Array.
        def walk(node, path, &block)
          case node
            when Hash
              node.each_pair { |key, child| walk(child, path + [key], &block) }
            when Array
              node.each_with_index do |child, position|
                walk(child, path + [position.to_s], &block)
              end
            else
              yield scrub_path(path).join('.'), node
          end
        end

        # Hook applied to the path before it is joined into a key; returns
        # the path untouched here.
        def scrub_path(path)
          path # no-op
        end
      end

    end
  end
end
|
@@ -0,0 +1,170 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
java_import 'java.lang.StringBuilder'
|
4
|
+
|
5
|
+
module Abroad
  module Extractors
    module Yaml
      # For some reason, Psych in MRI thinks it's valid yaml syntax to escape
      # single quotes (surprise, it's not). This class is a performant way to
      # identify and correct such invalid escaping. Be warned, it's a fairly
      # naive implementation.
      #
      # NOTE: this technique will only work with YAML emitted in block mode,
      # i.e. yaml that uses indentation and newlines for each key/value pair
      # and array element
      class JRubyCompat

        UNKNOWN_ESCAPE_MSG = "found unknown escape character '(39)"
        UNEXPECTED_EOS_MSG = 'found unexpected end of stream while scanning a quoted scalar'

        class << self
          # Returns a copy of `yaml_content` in which every line the YAML
          # parser rejects has had its escaped single quotes (\') replaced
          # with plain single quotes.
          def clean(yaml_content)
            # convert to java string for performance reasons (don't have to
            # coerce/convert from java to ruby and back again)
            # https://github.com/jruby/jruby/wiki/ImprovingJavaIntegrationPerformance#pre-coerce-values-used-repeatedly
            yaml_content = yaml_content.to_java
            ranges = identify_ranges(yaml_content)
            reconstruct(yaml_content, ranges)
          end

          private

          # given a list of range/substitution pairs, reconstruct the yaml_content
          # string so it contains the substitutions
          def reconstruct(yaml_content, ranges)
            builder = StringBuilder.new
            index = 0

            ranges.each do |(start, finish), substitution|
              # copy the untouched text before the range, then the fix
              builder.append(yaml_content, index, start)
              builder.append(substitution)
              index = finish
            end

            # copy whatever follows the last substitution
            builder.append(yaml_content, index, yaml_content.length)
            builder.toString
          end

          # Find the problem areas in yaml_content. Returns a list of range/substitution
          # pairs of the form [[start, finish], substitution]
          def identify_ranges(yaml_content)
            each_line_indices(yaml_content).each_with_object([]) do |(start, finish), ranges|
              if range = clean_line(yaml_content, start, finish)
                ranges << range
              end
            end
          end

          # Checks a single line (start inclusive, finish exclusive). Array
          # elements are checked as a whole line; key/value lines are only
          # checked when the value is double-quoted. Returns a
          # range/substitution pair, or nil if nothing needs fixing.
          def clean_line(yaml_content, start, finish)
            if array_element?(yaml_content, start, finish)
              clean_value(yaml_content, start, finish)
            else
              value_start, value_finish = find_value(yaml_content, start, finish)

              if value_start && double_quoted?(yaml_content, value_start, value_finish)
                clean_value(yaml_content, value_start, value_finish)
              end
            end
          end

          # if the yaml parser (snakeyaml in jruby's case) can parse the value,
          # return nil. If not, attempt to fix the problem by returning
          # a range/substitution pair.
          def clean_value(yaml_content, start, finish)
            value = yaml_content.substring(start, finish)

            begin
              # NOTE(review): YAML.load is used here only to detect syntax
              # errors; if the content can come from an untrusted source,
              # consider a non-instantiating alternative.
              YAML.load(value)
              nil
            rescue Psych::SyntaxError => e
              # error_type_for currently never returns nil, so the else
              # branch only fires if that method starts doing so.
              if error_type = error_type_for(e)
                fix(error_type, yaml_content, start, finish, value)
              else
                raise e
              end
            end
          end

          # naively try to fix the problem by replacing escaped single quotes
          def clean_string(value)
            value.gsub("\\'", "'")
          end

          # Builds the range/substitution pair for a value that failed to
          # parse.
          def fix(error_type, yaml_content, start, finish, value)
            # don't do anything with error type for now
            [[start, finish], clean_string(value)]
          end

          # Classifies a syntax error by its message. Always returns a
          # symbol; :unknown when the message matches neither known text.
          def error_type_for(e)
            if e.message.include?(UNKNOWN_ESCAPE_MSG)
              :unknown_escape_sequence
            elsif e.message.include?(UNEXPECTED_EOS_MSG)
              :unexpected_eos
            else
              :unknown
            end
          end

          # rather naively isolates the value in a yaml key/value pair by searching
          # for the first colon
          #
          # Returns [value_start, finish], or [nil, nil] when the line
          # contains no colon.
          def find_value(yaml_content, start, finish)
            colon_index = yaml_content.indexOf(':', start)

            # indexOf returns -1 when nothing is found, and may also find a
            # colon that belongs to a later line (finish is exclusive);
            # neither counts as a value on this line.
            if colon_index >= 0 && colon_index < finish
              [colon_index + 1, finish]
            else
              [nil, nil]
            end
          end

          # is this chunk of text double-quoted?
          def double_quoted?(yaml_content, start, finish)
            starts_with?('"'.ord, yaml_content, start, finish)
          end

          # is this line of yaml an array element, i.e. does it begin with "-"?
          def array_element?(yaml_content, start, finish)
            starts_with?('-'.ord, yaml_content, start, finish)
          end

          # Returns true if the first non-space character in
          # [start, finish) has the given character code. Returns false if
          # the span is empty or contains only spaces.
          def starts_with?(charcode, yaml_content, start, finish)
            limit = [finish, yaml_content.length].min

            (start...limit).each do |index|
              char = yaml_content.charAt(index)
              return char == charcode unless char == 32 # 32 == space
            end

            false
          end

          # Iterates over the yaml string and yields ranges that encapsulate
          # each "line". Expects lines to be delimited by newlines ("\n").
          #
          # Each range is yielded as (start, finish) with start inclusive
          # and finish exclusive; the terminating newline, when present, is
          # part of the range. Returns an enumerator if no block is given.
          def each_line_indices(yaml_content)
            if block_given?
              index = 0

              loop do
                next_index = yaml_content.indexOf("\n", index)

                if next_index == -1
                  # final line without a trailing newline: it starts at
                  # `index`, exactly like every other line
                  if index < yaml_content.length
                    yield index, yaml_content.length
                  end

                  break
                else
                  yield index, next_index + 1
                end

                index = next_index + 1
              end
            else
              to_enum(__method__, yaml_content)
            end
          end
        end

      end
    end
  end
end
|