txml_importer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 408bb4b21de0373f60c443d3bed90f95ccf26465
4
+ data.tar.gz: 27f0547d2809c1418f0ff6b1473e8bc4e56de6f1
5
+ SHA512:
6
+ metadata.gz: 7c0ef7a29f39360b50541b6af82a7d1ec1d7824ae1bded610d4358d9fe57657341b9dc9289a417c790987955408b7ae77207736ba17e95b72f91c392719d9a30
7
+ data.tar.gz: 426d75dc482a762b15506a3ce92205e3ed80c7f92bb53541f76147994f0006211a5233cc50965f988671b7539b861a87775aa0df9c5734abca33bfefd29cfb96
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.4
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in txml_importer.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ # TXML Importer
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/txml_importer.svg)](https://badge.fury.io/rb/txml_importer) [![Build Status](https://travis-ci.org/diasks2/txml_importer.png)](https://travis-ci.org/diasks2/txml_importer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/txml_importer/blob/master/LICENSE.txt)
4
+
5
+ This gem handles the importing and parsing of .txml translation memory files.
6
+
7
+ ## Installation
8
+
9
+ Install the gem directly, or add it to your application's Gemfile:
10
+
11
+ **Ruby**
12
+ ```
13
+ gem install txml_importer
14
+ ```
15
+
16
+ **Ruby on Rails**
17
+ Add this line to your application’s Gemfile:
18
+ ```ruby
19
+ gem 'txml_importer'
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```ruby
25
+ # Get the high level stats of a .txml file
26
+ # Including the encoding is optional. If not included the gem will attempt to detect the encoding.
27
+ file_path = File.expand_path('../txml_importer/spec/test_sample_files/sample.txml')
28
+ TxmlImporter::Txml.new(file_path: file_path).stats
29
+ # => {:tu_count=>1, :seg_count=>2, :language_pairs=>[["en", "fr"]]}
30
+
31
+ # Extract the segments of a .txml file
32
+ # Result: [translation_units, segments]
33
+ # translation_units = [tu_id]
34
+ # segments = [tu_id, segment_role, word_count, language, segment_text]
35
+
36
+ TxmlImporter::Txml.new(file_path: file_path).import
37
+ # => [[["6234-1457917153-1"]], [["6234-1457917153-1", "source", 2, "en", "Hello world"], ["6234-1457917153-1", "target", 3, "fr", "Bonjour le monde"]]]
38
+ ```
39
+
40
+ ## Contributing
41
+
42
+ 1. Fork it ( https://github.com/diasks2/txml_importer/fork )
43
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
44
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
45
+ 4. Push to the branch (`git push origin my-new-feature`)
46
+ 5. Create a new Pull Request
47
+
48
+ ## License
49
+
50
+ The MIT License (MIT)
51
+
52
+ Copyright (c) 2016 Kevin S. Dias
53
+
54
+ Permission is hereby granted, free of charge, to any person obtaining a copy
55
+ of this software and associated documentation files (the "Software"), to deal
56
+ in the Software without restriction, including without limitation the rights
57
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
58
+ copies of the Software, and to permit persons to whom the Software is
59
+ furnished to do so, subject to the following conditions:
60
+
61
+ The above copyright notice and this permission notice shall be included in
62
+ all copies or substantial portions of the Software.
63
+
64
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
65
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
66
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
67
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
68
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
69
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
70
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task default: :spec
@@ -0,0 +1,3 @@
1
# Top-level namespace of the txml_importer gem.
module TxmlImporter
  # Release number of the gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,129 @@
1
+ require 'txml_importer/version'
2
+ require 'xml'
3
+ require 'open-uri'
4
+ require 'pretty_strings'
5
+ require 'charlock_holmes'
6
+
7
+ module TxmlImporter
8
+ class Txml
9
+ attr_reader :file_path, :encoding
10
# Reads the .txml file at +file_path+ and settles on the encoding that the
# rest of the class uses to parse it.
#
# @param file_path [String] path of the .txml file
# @param args [Hash] optional settings; only +:encoding+ is read
#   ('UTF-8', 'UTF-16LE' or 'UTF-16BE'; it is upcased before use). When it
#   is omitted the encoding is detected from the file content.
# @raise [RuntimeError] when no encoding can be determined, or when it is
#   not one of the three supported encodings
def initialize(file_path:, **args)
  @file_path = file_path
  # File.read opens and closes the file itself, so no handle is left open.
  @content = File.read(@file_path) unless args[:encoding].eql?('UTF-8')
  @encoding =
    if args[:encoding].nil?
      detected = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])
      # Fall back to the XML declaration when the detector has no answer.
      (detected && detected[:encoding]) || declared_encoding
    else
      args[:encoding].upcase
    end
  @doc = {
    source_language: "",
    tu: { id: "", counter: 0, vals: [] },
    seg: { counter: 0, vals: [] },
    language_pairs: []
  }
  raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
  raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless %w[UTF-8 UTF-16LE UTF-16BE].include?(@encoding)
  # Non UTF-8 input is converted once; @text holds the UTF-8 copy.
  @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
end

# Looks up the encoding named in the file's XML declaration.
#
# Non-bang string methods are used on purpose: the bang variants return nil
# when nothing changes, which breaks a method chain. The value pattern stops
# at the closing quote so trailing attributes (e.g. standalone="no") are not
# captured.
#
# @return [String, nil] 'UTF-8', 'UTF-16LE' (used for a declared 'UTF-16'),
#   or nil when the declaration is missing or names anything else
def declared_encoding
  header = @content.dup.force_encoding('utf-8').scrub('*').gsub(/\0/, '')
  case header[/encoding=["']([^"']+)["']/, 1].to_s.upcase
  when 'UTF-8' then 'UTF-8'
  when 'UTF-16' then 'UTF-16LE'
  end
end
private :declared_encoding
36
+
37
# Scans the file line by line and reports its high level figures.
#
# NOTE(review): the counters live in @doc and are not reset here, so a
# second call (or a call after #import) appears to add to earlier totals.
#
# @return [Hash] :tu_count, :seg_count and the unique :language_pairs
def stats
  encoding.eql?('UTF-8') ? analyze_stats_utf_8 : analyze_stats_utf_16
  {
    tu_count: @doc[:tu][:counter],
    seg_count: @doc[:seg][:counter],
    language_pairs: @doc[:language_pairs].uniq
  }
end
45
+
46
# Parses the whole file and returns everything collected from it.
#
# @return [Array] a two element array: the translation unit rows, then the
#   segment rows
def import
  parse_file(read_file)
  units = @doc[:tu][:vals]
  segments = @doc[:seg][:vals]
  [units, segments]
end
51
+
52
+ private
53
+
54
# Feeds every line of the file at @file_path to #analyze_line.
#
# File.foreach yields one line at a time, so the file is never held in
# memory as a full array of lines.
def analyze_stats_utf_8
  File.foreach(@file_path) { |line| analyze_line(line) }
end
59
+
60
# Feeds every line of @text (the UTF-8 converted copy of the file) to
# #analyze_line.
def analyze_stats_utf_16
  @text.each_line { |converted_line| analyze_line(converted_line) }
end
65
+
66
# Builds the XML reader that #parse_file walks.
#
# UTF-8 files are streamed straight from disk. Anything else is parsed from
# @text, the UTF-8 converted copy, after three clean-ups: the declared
# encoding is rewritten to utf-8, character references with a hex value of
# 0x00-0x1F are blanked, and raw control characters (plus U+2028) are
# replaced by spaces.
#
# @return [XML::Reader]
def read_file
  if encoding.eql?('UTF-8')
    # NOTE(review): the handle returned by open is handed to the reader and
    # never closed explicitly here.
    XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
  else
    # [^"]* stops at the closing quote of the declaration. A greedy .* runs
    # to the last quote on the line, which wipes out a document stored on a
    # single line. Only the first occurrence is rewritten.
    xml = @text.sub(/(?<=encoding=")[^"]*(?=")/, 'utf-8')
               .gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ')
               .gsub(/[\0-\x1f\x7f\u2028]/, ' ')
    XML::Reader.string(xml, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
  end
end
74
+
75
# Updates the running statistics in @doc from one line of the file.
#
# - records the source language from a locale="..." attribute and the
#   target language from targetlocale="..." (single or double quotes)
# - adds the number of closing segment tags to the translation unit counter
# - adds closing source/target tags to the segment counter, leaving out
#   targets that close directly before </revision>
# - appends the current language pair once both languages are known
#
# @param line [String] one line of the .txml file
def analyze_line(line)
  # \b keeps the source pattern from matching inside "targetlocale=".
  source = line[/\blocale=(["'])([^"'\s]+)\1/, 2]
  target = line[/\btargetlocale=(["'])([^"'\s]+)\1/, 2]
  @doc[:source_language] = source if source
  @doc[:target_language] = target if target

  @doc[:tu][:counter] += line.scan(%r{</segment>}).count
  revision_targets = line.scan(%r{</target></revision>}).count
  @doc[:seg][:counter] += line.scan(%r{</source>}).count + line.scan(%r{</target>}).count - revision_targets

  # to_s makes a not-yet-seen (nil) language count as empty instead of raising.
  pair = [@doc[:source_language], @doc[:target_language]]
  @doc[:language_pairs] << pair unless pair.any? { |language| language.to_s.empty? }
end
84
+
85
# Walks every node the reader yields and records translation units and
# segments in @doc, then closes the reader.
#
# Tag names are compared byte by byte, as before (presumably to stay
# independent of the string encoding the reader returns - confirm).
#
# @param reader [XML::Reader] reader positioned before the first node
def parse_file(reader)
  revision_tag = 'revision'.bytes.to_a
  previous_tag = ''
  @count = 0
  while reader.read
    # The node that directly follows a "revision" node is ignored.
    if previous_tag.bytes.to_a != revision_tag
      tag = reader.name.bytes.to_a
      if tag == 'txml'.bytes.to_a
        if reader.has_attributes?
          locale = reader.get_attribute("locale")
          @doc[:source_language] = locale if locale
          target_locale = reader.get_attribute("targetlocale")
          @doc[:target_language] = target_locale if target_locale
        end
      elsif tag == 'segment'.bytes.to_a
        # Only every other "segment" node starts a new translation unit.
        if @count.even?
          generate_unique_id
          write_tu(reader)
        end
        @count += 1
      elsif tag == 'source'.bytes.to_a
        write_seg(reader, 'source')
      elsif tag == 'target'.bytes.to_a
        write_seg(reader, 'target')
      end
    end
    previous_tag = reader.name
  end
  reader.close
end
108
+
109
# Appends a row holding the current translation unit id to the collected
# translation units.
#
# @param reader [Object] accepted for symmetry with #write_seg; not used
def write_tu(reader)
  current_id = @doc[:tu][:id]
  @doc[:tu][:vals].push([current_id])
end
112
+
113
# Appends one segment row [tu_id, role, word_count, language, text] to the
# collected segments. Nothing is recorded when the reader has no text for
# the current node.
#
# @param reader [#read_string] reader positioned on a source/target node
# @param role [String] 'source' selects the source language; any other
#   value selects the target language
def write_seg(reader, role)
  raw = reader.read_string
  return if raw.nil?

  # Backslashes become the &#92; entity and apostrophes are backslash-escaped.
  text = PrettyStrings::Cleaner.new(raw).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
  # Splitting on ' ' already treats runs of whitespace as one separator.
  # The earlier gsub("\s+", ' ') matched the literal text " +" and dropped a
  # standalone "+" from the count.
  word_count = text.split(' ').length
  language = role.eql?('source') ? @doc[:source_language] : @doc[:target_language]
  @doc[:seg][:vals] << [@doc[:tu][:id], role, word_count, language, text]
end
124
+
125
# Stores a fresh id for the current translation unit in @doc and bumps the
# translation unit counter.
#
# The id has three dash-separated parts: four random digits, the current
# Unix time in seconds, and the incremented counter.
#
# @return [String] the new id
def generate_unique_id
  random_digits = Array.new(4) { rand(10) }.join
  @doc[:tu][:counter] += 1
  @doc[:tu][:id] = "#{random_digits}-#{Time.now.to_i}-#{@doc[:tu][:counter]}"
end
128
+ end
129
+ end
@@ -0,0 +1,2 @@
1
# Put the gem's lib directory at the front of the load path, then load the
# gem under test.
lib_dir = File.expand_path('../lib', __dir__)
$LOAD_PATH.unshift(lib_dir)
require 'txml_importer'
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
describe TxmlImporter do
  # Resolve the fixture relative to this spec file so the suite does not
  # depend on the directory it is started from.
  let(:file_path) { File.expand_path('sample_files/sample_1.txml', __dir__) }

  it 'has a version number' do
    expect(TxmlImporter::VERSION).not_to be nil
  end

  describe '#stats' do
    it 'reports the stats of a .txml file' do
      txml = TxmlImporter::Txml.new(file_path: file_path)
      expect(txml.stats).to eq({:tu_count=>112, :seg_count=>224, :language_pairs=>[["FR-FR", "EN"]]})
    end
  end

  describe '#import' do
    it 'returns every translation unit' do
      txml = TxmlImporter::Txml.new(file_path: file_path).import
      expect(txml[0].length).to eq(112)
    end

    it 'returns every segment' do
      txml = TxmlImporter::Txml.new(file_path: file_path).import
      expect(txml[1].length).to eq(224)
    end

    it 'gives the first segment the id of the first translation unit' do
      txml = TxmlImporter::Txml.new(file_path: file_path).import
      expect(txml[0][0][0]).to eq(txml[1][0][0])
    end

    it 'gives the last segment the id of the last translation unit' do
      txml = TxmlImporter::Txml.new(file_path: file_path).import
      expect(txml[0][-1][0]).to eq(txml[1][-1][0])
    end

    it 'ends with a target segment' do
      txml = TxmlImporter::Txml.new(file_path: file_path)
      expect(txml.import[1][-1][1]).to eq('target')
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'txml_importer/version'
5
+
6
# Gem packaging definition: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name          = "txml_importer"
  spec.version       = TxmlImporter::VERSION
  spec.authors       = ["Kevin S. Dias"]
  spec.email         = ["diasks2@gmail.com"]

  spec.summary       = %q{.txml file importer}
  spec.description   = %q{Import the content of a .txml file.}
  spec.homepage      = "https://github.com/diasks2/txml_importer"
  # The README carries the MIT license text; declaring it here makes the
  # license part of the gem's metadata.
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "pretty_strings", "~> 0.5.0"
  spec.add_runtime_dependency "charlock_holmes_bundle_icu", "~> 0.6.9.2"
end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: txml_importer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S. Dias
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: libxml-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pretty_strings
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.5.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.5.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: charlock_holmes_bundle_icu
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.9.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.6.9.2
97
+ description: Import the content of a .txml file.
98
+ email:
99
+ - diasks2@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - lib/txml_importer.rb
111
+ - lib/txml_importer/version.rb
112
+ - spec/spec_helper.rb
113
+ - spec/txml_importer_spec.rb
114
+ - txml_importer.gemspec
115
+ homepage: https://github.com/diasks2/txml_importer
116
+ licenses: []
117
+ metadata: {}
118
+ post_install_message:
119
+ rdoc_options: []
120
+ require_paths:
121
+ - lib
122
+ required_ruby_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ requirements: []
133
+ rubyforge_project:
134
+ rubygems_version: 2.4.1
135
+ signing_key:
136
+ specification_version: 4
137
+ summary: ".txml file importer"
138
+ test_files:
139
+ - spec/spec_helper.rb
140
+ - spec/txml_importer_spec.rb