txml_importer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 408bb4b21de0373f60c443d3bed90f95ccf26465
4
+ data.tar.gz: 27f0547d2809c1418f0ff6b1473e8bc4e56de6f1
5
+ SHA512:
6
+ metadata.gz: 7c0ef7a29f39360b50541b6af82a7d1ec1d7824ae1bded610d4358d9fe57657341b9dc9289a417c790987955408b7ae77207736ba17e95b72f91c392719d9a30
7
+ data.tar.gz: 426d75dc482a762b15506a3ce92205e3ed80c7f92bb53541f76147994f0006211a5233cc50965f988671b7539b861a87775aa0df9c5734abca33bfefd29cfb96
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.4
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve gems from the public RubyGems index.
source "https://rubygems.org"

# The gem's own dependencies are declared in txml_importer.gemspec.
gemspec
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # TXML Importer
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/txml_importer.svg)](https://badge.fury.io/rb/txml_importer) [![Build Status](https://travis-ci.org/diasks2/txml_importer.png)](https://travis-ci.org/diasks2/txml_importer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/txml_importer/blob/master/LICENSE.txt)
4
+
5
+ This gem handles the importing and parsing of .txml translation memory files.
6
+
7
+ ## Installation
8
+
9
+ Install the gem using one of the following methods:
10
+
11
+ **Ruby**
12
+ ```
13
+ gem install txml_importer
14
+ ```
15
+
16
+ **Ruby on Rails**
17
+ Add this line to your application’s Gemfile:
18
+ ```ruby
19
+ gem 'txml_importer'
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```ruby
25
+ # Get the high level stats of a .txml file
26
+ # Passing the encoding is optional. If it is omitted, the gem will attempt to detect it.
27
+ file_path = File.expand_path('../txml_importer/spec/test_sample_files/sample.txml')
28
+ TxmlImporter::Txml.new(file_path: file_path).stats
29
+ # => {:tu_count=>1, :seg_count=>2, :language_pairs=>[["en", "fr"]]}
30
+
31
+ # Extract the segments of a .txml file
32
+ # Result: [translation_units, segments]
33
+ # translation_units = [tu_id]
34
+ # segments = [tu_id, segment_role, word_count, language, segment_text]
35
+
36
+ TxmlImporter::Txml.new(file_path: file_path).import
37
+ # => [[["6234-1457917153-1"]], [["6234-1457917153-1", "source", 2, "en", "Hello world"], ["6234-1457917153-1", "target", 3, "fr", "Bonjour le monde"]]]
38
+ ```
39
+
40
+ ## Contributing
41
+
42
+ 1. Fork it ( https://github.com/diasks2/txml_importer/fork )
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create a new Pull Request
47
+
48
+ ## License
49
+
50
+ The MIT License (MIT)
51
+
52
+ Copyright (c) 2016 Kevin S. Dias
53
+
54
+ Permission is hereby granted, free of charge, to any person obtaining a copy
55
+ of this software and associated documentation files (the "Software"), to deal
56
+ in the Software without restriction, including without limitation the rights
57
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
58
+ copies of the Software, and to permit persons to whom the Software is
59
+ furnished to do so, subject to the following conditions:
60
+
61
+ The above copyright notice and this permission notice shall be included in
62
+ all copies or substantial portions of the Software.
63
+
64
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
66
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
68
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
69
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
70
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
# Bundler's gem packaging tasks and RSpec's rake integration.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# Define the `spec` task and run it when rake is invoked without arguments.
RSpec::Core::RakeTask.new(:spec)
task :default => :spec
@@ -0,0 +1,3 @@
1
module TxmlImporter
  # Version of the gem. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,129 @@
1
+ require 'txml_importer/version'
2
+ require 'xml'
3
+ require 'open-uri'
4
+ require 'pretty_strings'
5
+ require 'charlock_holmes'
6
+
7
module TxmlImporter
  # Reads a .txml translation memory file and exposes two operations:
  # #stats (counts and language pairs) and #import (translation units and
  # their segments).
  class Txml
    # Encodings the importer accepts.
    SUPPORTED_ENCODINGS = ['UTF-8', 'UTF-16LE', 'UTF-16BE'].freeze

    attr_reader :file_path, :encoding

    # @param file_path [String] location of the .txml file
    # @param args [Hash] optional settings; only :encoding is read. When it is
    #   omitted the encoding is detected from the file content.
    # @raise [RuntimeError] when the encoding cannot be determined or is not
    #   one of SUPPORTED_ENCODINGS
    def initialize(file_path:, **args)
      @file_path = file_path
      # The raw content is only needed for detection / conversion, so it is not
      # loaded when the caller states the file is UTF-8. The block form closes
      # the handle as soon as the content has been read.
      @content = open(@file_path) { |io| io.read } unless args[:encoding].eql?('UTF-8')
      @encoding = args[:encoding].nil? ? detect_encoding : args[:encoding].upcase
      @doc = {
        source_language: "",
        tu: { id: "", counter: 0, vals: [] },
        seg: { counter: 0, vals: [] },
        language_pairs: []
      }
      raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
      raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless SUPPORTED_ENCODINGS.include?(@encoding)
      @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
    end

    # Scans the file line by line and counts translation units and segments.
    # Counters are reset first, so repeated calls (or a call after #import)
    # return the same figures instead of accumulating.
    #
    # @return [Hash] :tu_count, :seg_count and :language_pairs
    def stats
      reset_stats
      if encoding.eql?('UTF-8')
        analyze_stats_utf_8
      else
        analyze_stats_utf_16
      end
      { tu_count: @doc[:tu][:counter], seg_count: @doc[:seg][:counter], language_pairs: @doc[:language_pairs].uniq }
    end

    # Parses the file and collects its translation units and segments.
    # Results of a previous call are discarded, so calling it twice does not
    # return duplicated rows.
    #
    # @return [Array] [translation_units, segments], where each translation
    #   unit is [tu_id] and each segment is
    #   [tu_id, role, word_count, language, text]
    def import
      @doc[:tu][:vals] = []
      @doc[:seg][:vals] = []
      reader = read_file
      parse_file(reader)
      [@doc[:tu][:vals], @doc[:seg][:vals]]
    end

    private

    # Determines the encoding when the caller did not supply one: first via
    # CharlockHolmes, then from the encoding="..." declaration in the content.
    # Returns nil when neither yields a usable answer.
    def detect_encoding
      # NOTE(review): guarded in case the detector returns nil when it finds
      # no match - confirm against the charlock_holmes documentation.
      detection = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])
      detected = detection && detection[:encoding]
      return detected unless detected.nil?

      # Non-bang scrub/delete always return a string (gsub! returns nil when
      # nothing is replaced), and [^"]* stops at the closing quote instead of
      # swallowing later attributes on the same line.
      declared = @content.dup.force_encoding('utf-8').scrub('*').delete("\0")[/(?<=encoding=")[^"]*(?=")/]
      return nil if declared.nil?

      case declared.upcase
      when 'UTF-8' then 'UTF-8'
      when 'UTF-16' then 'UTF-16LE'
      end
    end

    # Clears everything #stats accumulates.
    def reset_stats
      @doc[:tu][:counter] = 0
      @doc[:seg][:counter] = 0
      @doc[:language_pairs] = []
    end

    def analyze_stats_utf_8
      File.readlines(@file_path).each { |line| analyze_line(line) }
    end

    def analyze_stats_utf_16
      @text.each_line { |line| analyze_line(line) }
    end

    # Builds the XML reader: straight from the file for UTF-8 input, otherwise
    # from the converted text with its encoding declaration rewritten to utf-8
    # and control characters / low numeric character references blanked out.
    def read_file
      if encoding.eql?('UTF-8')
        XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      else
        xml = @text.gsub(/(?<=encoding=")[^"]*(?=")/, 'utf-8')
                   .gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ')
                   .gsub(/[\0-\x1f\x7f\u2028]/, ' ')
        XML::Reader.string(xml, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      end
    end

    # Updates the languages and counters from a single line of the file.
    def analyze_line(line)
      source = attribute_value(line, 'locale')
      target = attribute_value(line, 'targetlocale')
      @doc[:source_language] = source if source
      @doc[:target_language] = target if target
      @doc[:tu][:counter] += line.scan(/<\/segment>/).count
      # A <target> nested in a <revision> is not counted as a segment.
      @doc[:seg][:counter] += (line.scan(/<\/source>/).count + line.scan(/<\/target>/).count - line.scan(/<\/target><\/revision>/).count)
      record_language_pair
    end

    # Returns the quoted, whitespace-free value of attribute +name+ on +line+,
    # or nil. The lookbehind keeps `locale` from matching inside
    # `targetlocale`.
    def attribute_value(line, name)
      return nil unless line.include?("#{name}=")
      line[/(?<![\w-])#{name}=(["'])(\S+?)\1/, 2]
    end

    # Remembers the current source/target pair once both are known.
    def record_language_pair
      source = @doc[:source_language]
      target = @doc[:target_language]
      return if source.nil? || source.empty? || target.nil? || target.empty?
      pair = [source, target]
      @doc[:language_pairs] << pair unless @doc[:language_pairs].include?(pair)
    end

    # Walks every node of the reader, then closes it (also when parsing
    # raises). The node that directly follows a `revision` node is skipped.
    def parse_file(reader)
      last_tag = ''
      @count = 0
      while reader.read
        tag = reader.name
        handle_node(reader, tag) unless last_tag == 'revision'
        last_tag = tag
      end
    ensure
      reader.close
    end

    # Dispatches a node by tag name.
    def handle_node(reader, tag)
      case tag
      when 'txml'
        read_languages(reader)
      when 'segment'
        # Only every other `segment` node starts a new translation unit
        # (presumably the reader reports both the opening and closing tag).
        if @count.even?
          generate_unique_id
          write_tu(reader)
        end
        @count += 1
      when 'source'
        write_seg(reader, 'source')
      when 'target'
        write_seg(reader, 'target')
      end
    end

    # Copies the locale attributes of the node into the document state.
    def read_languages(reader)
      return unless reader.has_attributes?
      source = reader.get_attribute("locale")
      target = reader.get_attribute("targetlocale")
      @doc[:source_language] = source if source
      @doc[:target_language] = target if target
    end

    def write_tu(reader)
      @doc[:tu][:vals] << [@doc[:tu][:id]]
    end

    # Appends a segment row for the current translation unit.
    def write_seg(reader, role)
      raw = reader.read_string
      return if raw.nil?
      text = PrettyStrings::Cleaner.new(raw).pretty.gsub("\\", "&#92;").gsub("'", %q(\\\'))
      # split with no argument already splits on runs of whitespace; the old
      # gsub("\s+", ' ') removed literal " +" sequences and under-counted
      # text such as "1 + 2".
      word_count = text.split.length
      language = role.eql?('source') ? @doc[:source_language] : @doc[:target_language]
      @doc[:seg][:vals] << [@doc[:tu][:id], role, word_count, language, text]
    end

    # Builds an id of the form "<4 random digits>-<epoch seconds>-<counter>".
    def generate_unique_id
      @doc[:tu][:counter] += 1
      random_prefix = Array.new(4) { rand(10) }.join
      @doc[:tu][:id] = [random_prefix, Time.now.to_i, @doc[:tu][:counter]].join("-")
    end
  end
end
@@ -0,0 +1,2 @@
1
# Put the gem's lib directory at the front of the load path, then load the gem.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir)
require 'txml_importer'
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
describe TxmlImporter do
  # The fixture is resolved relative to this spec file so the suite does not
  # depend on the working directory or on the name of the checkout directory.
  let(:file_path) { File.expand_path('../sample_files/sample_1.txml', __FILE__) }
  let(:txml) { TxmlImporter::Txml.new(file_path: file_path) }

  it 'has a version number' do
    expect(TxmlImporter::VERSION).not_to be nil
  end

  describe '#stats' do
    it 'reports the stats of a .txml file' do
      expect(txml.stats).to eq({:tu_count=>112, :seg_count=>224, :language_pairs=>[["FR-FR", "EN"]]})
    end
  end

  describe '#import' do
    # #import returns [translation_units, segments].
    let(:imported) { txml.import }
    let(:translation_units) { imported[0] }
    let(:segments) { imported[1] }

    it 'extracts every translation unit' do
      expect(translation_units.length).to eq(112)
    end

    it 'extracts every segment' do
      expect(segments.length).to eq(224)
    end

    it 'links the first segment to the first translation unit' do
      expect(translation_units[0][0]).to eq(segments[0][0])
    end

    it 'links the last segment to the last translation unit' do
      expect(translation_units[-1][0]).to eq(segments[-1][0])
    end

    it 'ends with a target segment' do
      expect(segments[-1][1]).to eq('target')
    end
  end
end
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'txml_importer/version'

Gem::Specification.new do |spec|
  spec.name          = "txml_importer"
  spec.version       = TxmlImporter::VERSION
  spec.authors       = ["Kevin S. Dias"]
  spec.email         = ["diasks2@gmail.com"]

  spec.summary       = %q{.txml file importer}
  spec.description   = %q{Import the content of a .txml file.}
  spec.homepage      = "https://github.com/diasks2/txml_importer"
  # The README states the MIT License; declare it so the packaged metadata
  # carries the license as well.
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "pretty_strings", "~> 0.5.0"
  spec.add_runtime_dependency "charlock_holmes_bundle_icu", "~> 0.6.9.2"
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: txml_importer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S. Dias
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: libxml-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pretty_strings
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.5.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.5.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: charlock_holmes_bundle_icu
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.9.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.6.9.2
97
+ description: Import the content of a .txml file.
98
+ email:
99
+ - diasks2@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - lib/txml_importer.rb
111
+ - lib/txml_importer/version.rb
112
+ - spec/spec_helper.rb
113
+ - spec/txml_importer_spec.rb
114
+ - txml_importer.gemspec
115
+ homepage: https://github.com/diasks2/txml_importer
116
+ licenses: []
117
+ metadata: {}
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ requirements: []
133
+ rubyforge_project:
134
+ rubygems_version: 2.4.1
135
+ signing_key:
136
+ specification_version: 4
137
+ summary: ".txml file importer"
138
+ test_files:
139
+ - spec/spec_helper.rb
140
+ - spec/txml_importer_spec.rb