tbx_importer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4f732759e6d8b7243d4ee79cc9926309cdda6ea1
4
+ data.tar.gz: 27f497202083e911be6c9ab6c8aa2ccec4d977f5
5
+ SHA512:
6
+ metadata.gz: 2c5e2f90baded7436b56c8f522b455cb4b696f857579376cdd51b8bfcff8ea300f395f47b054ed32c04e58bc7812e6e752cee63cde6815d68d1c13d4ce978580
7
+ data.tar.gz: 30e08a3ab20b63d3e3459a5f73d0d05cb69c3490b03f46e2f5f64e10345b1ff0950b2c63996b4566233d9590d89da9fa77cc64760555cd95fb66b291b742bc43
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.4
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in tbx_importer.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # TBX (TermBase eXchange) Importer
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/tbx_importer.svg)](https://badge.fury.io/rb/tbx_importer) [![Build Status](https://travis-ci.org/diasks2/tbx_importer.png)](https://travis-ci.org/diasks2/tbx_importer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/tbx_importer/blob/master/LICENSE.txt)
4
+
5
+ This gem handles the importing and parsing of [.tbx files](http://www.ttt.org/oscarStandards/tbx/tbx_oscar.pdf). [TBX files](http://www.ttt.org/tbx/) are XML files.
6
+
7
+ ## Installation
8
+
9
+ Install the gem using one of the following methods:
10
+
11
+ **Ruby**
12
+ ```
13
+ gem install tbx_importer
14
+ ```
15
+
16
+ **Ruby on Rails**
17
+ Add this line to your application’s Gemfile:
18
+ ```ruby
19
+ gem 'tbx_importer'
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ```ruby
25
+ # Get the high level stats of a TBX file
26
+ # Including the encoding is optional. If not included the gem will attempt to detect the encoding.
27
+ file_path = File.expand_path('../tbx_importer/spec/sample_files/sample.tbx')
28
+ tbx = TbxImporter::Tbx.new(file_path: file_path)
29
+ tbx.stats
30
+ # => {:tc_count=>1, :term_count=>3, :language_pairs=>[["en", "fr"], ["en", "es"]]}
31
+
32
+ # Extract the segments of a TBX file
33
+ # Result: [term concepts, terms]
34
+ # term concepts = [tc_id, definition]
35
+ # terms = [tc_id, language, part_of_speech, term]
36
+
37
+ tbx.import
38
+ # => [[["6234-1457917153-1", "the earth, together with all of its countries, peoples, and natural features."]], [["6234-1457917153-1", "en", "noun", "world"], ["6234-1457917153-1", "fr", "noun", "monde"], ["6234-1457917153-1", "es", "noun", "mundo"]]]
39
+ ```
40
+
41
+ ## Contributing
42
+
43
+ 1. Fork it ( https://github.com/diasks2/tbx_importer/fork )
44
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
45
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
46
+ 4. Push to the branch (`git push origin my-new-feature`)
47
+ 5. Create a new Pull Request
48
+
49
+ ## License
50
+
51
+ The MIT License (MIT)
52
+
53
+ Copyright (c) 2016 Kevin S. Dias
54
+
55
+ Permission is hereby granted, free of charge, to any person obtaining a copy
56
+ of this software and associated documentation files (the "Software"), to deal
57
+ in the Software without restriction, including without limitation the rights
58
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
59
+ copies of the Software, and to permit persons to whom the Software is
60
+ furnished to do so, subject to the following conditions:
61
+
62
+ The above copyright notice and this permission notice shall be included in
63
+ all copies or substantial portions of the Software.
64
+
65
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
66
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
67
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
68
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
69
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
70
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
71
+ THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,3 @@
1
module TbxImporter
  # Version string of this gem release. Frozen so the shared constant
  # cannot be mutated in place by callers.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,150 @@
1
+ require 'tbx_importer/version'
2
+ require 'xml'
3
+ require 'open-uri'
4
+ require 'pretty_strings'
5
+ require 'charlock_holmes'
6
+
7
module TbxImporter
  # Reads a TBX (TermBase eXchange) file and exposes line-based summary
  # statistics (#stats) and the extracted term concepts and terms (#import).
  class Tbx
    # Encodings accepted by #initialize (compared after upcasing).
    SUPPORTED_ENCODINGS = %w[UTF-8 UTF-16LE UTF-16BE].freeze

    # Value of an encoding="..." attribute. The character class stops at the
    # closing quote, so other attributes on the same line are never swallowed.
    ENCODING_ATTRIBUTE = /(?<=encoding=")[^"]*(?=")/

    attr_reader :file_path, :encoding

    # @param file_path [String] path of the TBX file to read
    # @param args [Hash] optional :encoding ('UTF-8', 'UTF-16LE' or
    #   'UTF-16BE', case-insensitive); when omitted the encoding is detected
    # @raise [RuntimeError] when the encoding cannot be determined or is not
    #   one of SUPPORTED_ENCODINGS
    def initialize(file_path:, **args)
      @file_path = file_path
      requested = args[:encoding]
      requested = requested.to_s.upcase unless requested.nil?
      # The raw content is only needed for detection and UTF-16 conversion.
      @content = read_content unless requested.eql?('UTF-8')
      @encoding = requested || detect_encoding
      @doc = {
        source_language: "",
        tc: { id: "", counter: 0, vals: [], lang: "", definition: "" },
        term: { lang: "", counter: 0, vals: [], part_of_speech: "" },
        language_pairs: [],
        term_entry: false
      }
      raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
      raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless SUPPORTED_ENCODINGS.include?(@encoding)
      @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
    end

    # Scans the file line by line and counts closing termEntry / term tags.
    # The counters are reset first, so the result is the same no matter how
    # often #stats is called or whether #import ran before it.
    #
    # @return [Hash] :tc_count, :term_count and :language_pairs
    def stats
      reset_stats
      lines = encoding.eql?('UTF-8') ? File.readlines(@file_path) : @text.each_line
      lines.each { |line| analyze_line(line) }
      {tc_count: @doc[:tc][:counter], term_count: @doc[:term][:counter], language_pairs: @doc[:language_pairs].uniq}
    end

    # Parses the file with the XML reader and collects the rows.
    # Collected rows are reset first, so repeated calls do not accumulate.
    #
    # @return [Array] two arrays: term concepts as [id, definition] and
    #   terms as [id, language, part_of_speech, term]
    def import
      reset_import
      reader = read_file
      parse_file(reader)
      [@doc[:tc][:vals], @doc[:term][:vals]]
    end

    private

    # Reads the whole file and closes the handle that `open` returned.
    # NOTE(review): Kernel#open treats a path starting with "|" as a command;
    # do not pass untrusted input as file_path.
    def read_content
      io = open(@file_path)
      File.read(io)
    ensure
      io.close if io.respond_to?(:close)
    end

    # Asks CharlockHolmes for the encoding of the first part of the content;
    # when it reports none, falls back to the encoding="..." declaration found
    # in the content. Returns nil when neither yields a known value.
    def detect_encoding
      detected = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])[:encoding]
      return detected unless detected.nil?
      # Non-bang calls: gsub!/scrub!-style chaining breaks because the bang
      # variants may return nil when nothing was changed.
      declared = @content.dup.force_encoding('utf-8').scrub('*').delete("\0")[ENCODING_ATTRIBUTE]
      case declared.to_s.upcase
      when 'UTF-8' then 'UTF-8'
      when 'UTF-16' then 'UTF-16LE'
      end
    end

    # Clears everything #stats accumulates while scanning lines.
    def reset_stats
      @doc[:source_language] = ""
      @doc[:term_entry] = false
      @doc[:language_pairs] = []
      @doc[:tc][:counter] = 0
      @doc[:term][:counter] = 0
    end

    # Clears everything #import accumulates while parsing. Fresh arrays are
    # assigned so results returned by an earlier call are left untouched.
    def reset_import
      @doc[:tc][:vals] = []
      @doc[:tc][:definition] = ""
      @doc[:term][:vals] = []
      @doc[:term][:lang] = ""
      @doc[:term][:part_of_speech] = ""
    end

    # Builds the XML reader: straight from the file for UTF-8 input, otherwise
    # from the converted text with its encoding declaration rewritten to
    # utf-8 and low character references / control characters blanked out.
    def read_file
      if encoding.eql?('UTF-8')
        XML::Reader.io(open(file_path), options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      else
        reader = @text.gsub(ENCODING_ATTRIBUTE, 'utf-8').gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ').gsub(/[\0-\x1f\x7f\u2028]/, ' ')
        XML::Reader.string(reader, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      end
    end

    # Updates the running statistics with one line of the file.
    def analyze_line(line)
      @doc[:term_entry] = true if line.include?('termEntry')
      # Single-quoted lang attributes win; otherwise use double-quoted ones.
      language = line.scan(/(?<=lang=\S)\S+(?=')/)
      language = line.scan(/(?<=lang=\S)\S+(?=")/) if language.empty?
      language = language.uniq
      # The first language seen once a termEntry has started is the source.
      if line.include?('lang=') && !language.empty? && @doc[:source_language].empty? && @doc[:term_entry]
        @doc[:source_language] = language[0]
      end
      @doc[:tc][:counter] += line.scan(/<\/termEntry>/).count
      @doc[:term][:counter] += line.scan(/<\/term>/).count
      return if @doc[:source_language].empty?
      language.each do |lang|
        next if @doc[:source_language].eql?(lang)
        @doc[:language_pairs] << [@doc[:source_language], lang]
      end
      @doc[:language_pairs] = @doc[:language_pairs].uniq
    end

    # Walks every node of the reader. A node name not yet on the stack is
    # pushed and evaluated; a name equal to the top of the stack pops it, and
    # popping a termEntry starts a new term-concept id.
    def parse_file(reader)
      tag_stack = []
      generate_unique_id
      while reader.read
        name = reader.name
        if !tag_stack.include?(name)
          tag_stack.push(name)
          eval_state(tag_stack, reader)
        elsif tag_stack.last == name
          generate_unique_id if tag_stack.pop == 'termEntry'
        end
      end
    ensure
      reader.close
    end

    # Reacts to the tag that was just pushed onto the stack.
    def eval_state(tag_stack, reader)
      case tag_stack.last
      when 'martif'
        @doc[:lang] = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
        @doc[:language_pairs] << @doc[:lang]
      when 'termEntry'
        write_tc
      when 'langSet'
        @doc[:term][:lang] = reader.get_attribute("lang") || reader.get_attribute("xml:lang")
        @doc[:language_pairs] << @doc[:term][:lang]
      when 'term'
        write_term(reader)
      when 'termNote'
        @doc[:term][:part_of_speech] = sanitize(reader.read_string.downcase) if reader.get_attribute("type").eql?("partOfSpeech")
        # NOTE(review): the last term row is dropped and written again from
        # the current node, presumably to attach the part of speech - confirm.
        @doc[:term][:vals].pop
        write_term(reader)
      when 'descrip'
        @doc[:tc][:definition] = sanitize(reader.read_string) if reader.get_attribute("type").eql?("definition")
        # Replace the row written for the termEntry with one that carries
        # the definition.
        @doc[:tc][:vals].pop
        write_tc
      end
    end

    # Cleans a string with PrettyStrings, then replaces backslashes with
    # &#92; and prefixes single quotes with a backslash.
    def sanitize(string)
      PrettyStrings::Cleaner.new(string).pretty.gsub("\\","&#92;").gsub("'",%q(\\\'))
    end

    # Appends the current term concept row and clears the pending definition.
    def write_tc
      @doc[:tc][:vals] << [@doc[:tc][:id], @doc[:tc][:definition]]
      @doc[:tc][:definition] = ""
    end

    # Appends a term row built from the reader's current string content;
    # does nothing when the reader has no string for the node.
    def write_term(reader)
      raw = reader.read_string
      return if raw.nil?
      @doc[:term][:vals] << [@doc[:tc][:id], @doc[:term][:lang], @doc[:term][:part_of_speech], sanitize(raw)]
    end

    # Sets the current term-concept id to "<4 random digits>-<epoch>-<n>".
    # NOTE: <n> comes from the same counter #stats uses for tc_count, which is
    # why #stats resets that counter before scanning.
    def generate_unique_id
      @doc[:tc][:id] = [(1..4).map{rand(10)}.join(''), Time.now.to_i, @doc[:tc][:counter] += 1 ].join("-")
    end
  end
end
@@ -0,0 +1,59 @@
1
+ <?xml version="1.0" encoding="UTF-8"?><martif type="TBX" xml:lang="en-US">
2
+ <martifHeader>
3
+ <fileDesc>
4
+ <titleStmt>
5
+ <title>Microsoft Terminology Collection Export</title>
6
+ </titleStmt>
7
+ <sourceDesc><p>Microsoft Terminology Collection</p></sourceDesc>
8
+ </fileDesc>
9
+ </martifHeader>
10
+ <text>
11
+ <body>
12
+ <termEntry id="2_76365">
13
+ <langSet xml:lang="en-US">
14
+ <descripGrp>
15
+ <descrip type="definition">To terminate a session with a computer accessed through a communications line usually a computer that is both distant and open to many users.</descrip>
16
+ </descripGrp>
17
+ <ntig>
18
+ <termGrp>
19
+ <term id="76365">log off</term>
20
+ <termNote type="partOfSpeech">Verb</termNote>
21
+ </termGrp>
22
+ </ntig>
23
+ </langSet>
24
+ <langSet xml:lang="fr-fr">
25
+ <ntig>
26
+ <termGrp>
27
+ <term id="215541">fermer une session</term>
28
+ <termNote type="partOfSpeech">Verb</termNote>
29
+ </termGrp>
30
+ </ntig>
31
+ </langSet>
32
+ </termEntry>
33
+ <termEntry id="3_106184"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">To shorten the fractional part of a number, increasing the last remaining (rightmost) digit or not, according to whether the deleted portion was over or under five.</descrip></descripGrp><ntig><termGrp><term id="106184">round</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="106187">arrondir</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet></termEntry><termEntry id="4_156085"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">Personally identifiable information (PII) that is protected in special ways by law or policy.</descrip></descripGrp><ntig><termGrp><term id="156085">sensitive data</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="156096">données sensibles</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="5_182425"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A business unit that is immediately under another business unit in the business hierarchy of an organization.</descrip></descripGrp><ntig><termGrp><term id="182425">child business unit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="182429">sous-division</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="7_96886"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The mode of a command-line application where it does not display confirmation messages or any other user interface items that normally appear on screen. 
The switch for quiet mode is typically '/q'.</descrip></descripGrp><ntig><termGrp><term id="96886">quiet mode</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="212192">mode silencieux</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="9_179350"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A virtual hard disk that points to and uses an entire physical disk for the purpose of converting a data disk to a virtual hard disk. You cannot turn on a virtual machine if a linked disk is attached to the virtual machine.</descrip></descripGrp><ntig><termGrp><term id="179350">linked virtual hard disk</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="179352">disque dur virtuel lié</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_307453"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="307453">Telephone User Interface</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="318298">interface utilisateur de téléphonie</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_307467"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="307467">TUI</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="318299">TUI</term><termNote 
type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_1218391"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="1218391">touchtone interface</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="1783210">interface à tonalité</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry>
34
+ <termEntry id="12_233129">
35
+ <langSet xml:lang="en-US">
36
+ <descripGrp>
37
+ <descrip type="definition">The part of a service order that specifies detailed information about the requested service.</descrip>
38
+ </descripGrp>
39
+ <ntig>
40
+ <termGrp>
41
+ <term id="233129">service order line</term>
42
+ <termNote type="partOfSpeech">Noun</termNote>
43
+ </termGrp></ntig>
44
+ </langSet>
45
+ <langSet xml:lang="fr-fr">
46
+ <ntig>
47
+ <termGrp>
48
+ <term id="324824">ligne d'ordre de service</term>
49
+ <termNote type="partOfSpeech">Noun</termNote>
50
+ </termGrp>
51
+ </ntig>
52
+ <ntig>
53
+ <termGrp>
54
+ <term id="1950253">ligne commande service</term>
55
+ <termNote type="partOfSpeech">Noun</termNote>
56
+ </termGrp>
57
+ </ntig>
58
+ </langSet>
59
+ </termEntry><termEntry id="15_397203"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A logical grouping of managed folders. When a managed folder mailbox policy is applied to a user’s mailbox, all the managed folders that are linked to the policy are deployed in a single operation, thereby making the deployment of messaging records management (MRM) easier.</descrip></descripGrp><ntig><termGrp><term id="397203">managed folder mailbox policy</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="410598">Stratégie de boîte aux lettres de dossier géré</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="16_494929"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The space between the inside edge of a child element and its content.</descrip></descripGrp><ntig><termGrp><term id="494929">padding</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="499409">marge intérieure</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="20_103799"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">For failover clusters or server clusters, a physical or logical entity that is capable of being managed by a cluster, brought online and taken offline, and moved between nodes. 
A resource can be owned only by a single node at any point in time.</descrip></descripGrp><ntig><termGrp><term id="103799">resource</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="103802">ressource</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="21_216347"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A field that can be used in a workflow rule or template as a placeholder for specific values or text in an e-mail. When you send an e-mail, these placeholders are replaced with the data that meet the conditions that have been set in the workflow rule. For example, if you send an e-mail to an account holder about a contract cancellation date, you can use a dynamic data field to automatically find and insert the correct cancellation date for that contract.</descrip></descripGrp><ntig><termGrp><term id="216347">dynamic data field</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="218231">champ de données dynamiques</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="22_364026"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A component of the World Wide Web Publishing Service (WWW service) in IIS that is responsible for configuration, by means of the metabase, and for worker process management.</descrip></descripGrp><ntig><termGrp><term id="364026">WWW Service Administration and Monitoring component</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="365290">composant Analyse et administration du service WWW</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="23_461846"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A tool on 
the main Windows SteadyState interface used to set schedules for software and operating system updates.</descrip></descripGrp><ntig><termGrp><term id="461846">Schedule Important Software Updates</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="514693">Planification de mises à jour logicielles importantes</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="24_537631"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A component that can be registered as part of the ASP.NET request lifecycle and that can read or change the request or response as it is processed. HttpModules are often used to perform special tasks that need to monitor each request, such as security or site statistics.</descrip></descripGrp><ntig><termGrp><term id="537631">HTTP module</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="541917">module HTTP</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="25_42999"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">In communications, the portion of a Data Terminal Equipment (DTE) device that sends data.</descrip></descripGrp><ntig><termGrp><term id="42999">data source</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="43001">source de données</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="26_147281"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">To deliver data to a client without a client request for the data.</descrip></descripGrp><ntig><termGrp><term id="147281">push</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term 
id="147288">effectuer une transmission de type push</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet></termEntry><termEntry id="27_136378"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A transmission medium designed for high-speed data transfers over long distances. Cable modem services and Digital Subscriber Line (DSL) are examples of broadband networks.</descrip></descripGrp><ntig><termGrp><term id="136378">broadband</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="212456">haut débit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="27_216700"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A transmission medium designed for high-speed data transfers over long distances. Cable modem services and Digital Subscriber Line (DSL) are examples of broadband networks.</descrip></descripGrp><ntig><termGrp><term id="216700">broadband network</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="216717">réseau haut débit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="29_45263"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A mark placed over, under, or through a character, usually to indicate a change in phonetic value from the unmarked state.</descrip></descripGrp><ntig><termGrp><term id="45263">diacritical mark</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="45266">signe diacritique</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="29_45264"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A mark placed over, under, or through a character, usually to indicate a change in 
phonetic value from the unmarked state.</descrip></descripGrp><ntig><termGrp><term id="45264">diacritic</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="45266">signe diacritique</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="30_237215"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The part of a scenario that determines its uniqueness from other scenarios. </descrip></descripGrp><ntig><termGrp><term id="237215">differentiating factor</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="276593">facteur de différenciation</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="31_1167251"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A shared service in Windows SharePoint Services and SharePoint Server that provides a means for storing, securing, and administering external content types and related objects. </descrip></descripGrp><ntig><termGrp><term id="1167251">Business Data Connectivity service</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="1167718">Service BDC</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet></termEntry></body></text></martif>
@@ -0,0 +1,2 @@
1
# Put the gem's lib directory (two levels up from this file, then lib)
# at the front of the load path before loading the gem itself.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir)
require 'tbx_importer'
@@ -0,0 +1,83 @@
1
+ require 'spec_helper'
2
+
3
describe TbxImporter do
  # Absolute path of a fixture in spec/sample_files. It is resolved relative
  # to this spec file, so the suite does not depend on the working directory
  # or on the name of the checkout directory.
  def sample_file(name)
    File.expand_path("../sample_files/#{name}", __FILE__)
  end

  let(:utf8_file) { sample_file('sample_1(utf-8).tbx') }
  let(:utf16_file) { sample_file('sample_1(utf-16).tbx') }

  it 'has a version number' do
    expect(TbxImporter::VERSION).not_to be nil
  end

  describe '#stats' do
    it 'reports the stats of a UTF-8 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file)
      expect(tbx.stats).to eq({:tc_count=>25, :term_count=>51, :language_pairs=>[["en-US", "fr-fr"]]})
    end

    it 'reports the stats of a UTF-16 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file)
      expect(tbx.stats).to eq({:tc_count=>10, :term_count=>21, :language_pairs=>[["EN-US", "PL"], ['EN-US', 'FR']]})
    end
  end

  describe '#import' do
    it 'imports a UTF-8 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file).import
      expect(tbx[0].length).to eq(25)
    end

    it 'imports a UTF-8 TBX file 2' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file).import
      expect(tbx[1].length).to eq(51)
    end

    it 'imports a UTF-8 TBX file 3' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file).import
      expect(tbx[0][-1][0]).to eq(tbx[1][-1][0])
    end

    it 'imports a UTF-8 TBX file 4' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file).import
      expect(tbx[0][1][1]).to eq("To shorten the fractional part of a number, increasing the last remaining (rightmost) digit or not, according to whether the deleted portion was over or under five.")
    end

    it 'imports a UTF-8 TBX file 5' do
      tbx = TbxImporter::Tbx.new(file_path: utf8_file).import
      expect(tbx[1][1][2]).to eq("verb")
    end

    it 'imports a UTF-16 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file).import
      expect(tbx[0].length).to eq(10)
    end

    it 'imports a UTF-16 TBX file 2' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file).import
      expect(tbx[1].length).to eq(21)
    end

    it 'imports a UTF-16 TBX file 3' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file).import
      expect(tbx[0][-1][0]).to eq(tbx[1][-1][0])
    end

    it 'imports a UTF-16 TBX file 4' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file).import
      expect(tbx[0][1][1]).to eq('')
    end

    it 'imports a UTF-16 TBX file 5' do
      tbx = TbxImporter::Tbx.new(file_path: utf16_file).import
      expect(tbx[1][1][2]).to eq('')
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'tbx_importer/version'
5
+
6
# Gem specification for tbx_importer: identity, packaged files and
# dependencies.
Gem::Specification.new do |spec|
  spec.name          = "tbx_importer"
  spec.version       = TbxImporter::VERSION
  spec.authors       = ["Kevin S. Dias"]
  spec.email         = ["diasks2@gmail.com"]

  spec.summary       = %q{TBX (TermBase eXchange) file importer}
  spec.description   = %q{Import the content of a TBX (TermBase eXchange) file}
  spec.homepage      = "https://github.com/diasks2/tbx_importer"
  # The README states the MIT license; declare it so the packaged gem
  # metadata does not ship with an empty license list.
  spec.license       = "MIT"

  # Package every file tracked by git (NUL-separated listing).
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "pretty_strings", "~> 0.7.0"
  spec.add_runtime_dependency "charlock_holmes_bundle_icu", "~> 0.6.9.2"
end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tbx_importer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kevin S. Dias
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-03-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: libxml-ruby
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: pretty_strings
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 0.7.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 0.7.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: charlock_holmes_bundle_icu
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 0.6.9.2
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 0.6.9.2
97
+ description: Import the content of a TBX (TermBase eXchange) file
98
+ email:
99
+ - diasks2@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".rspec"
106
+ - ".travis.yml"
107
+ - Gemfile
108
+ - README.md
109
+ - Rakefile
110
+ - lib/tbx_importer.rb
111
+ - lib/tbx_importer/version.rb
112
+ - spec/sample_files/sample_1(utf-16).tbx
113
+ - spec/sample_files/sample_1(utf-8).tbx
114
+ - spec/spec_helper.rb
115
+ - spec/tbx_importer_spec.rb
116
+ - tbx_importer.gemspec
117
+ homepage: https://github.com/diasks2/tbx_importer
118
+ licenses: []
119
+ metadata: {}
120
+ post_install_message:
121
+ rdoc_options: []
122
+ require_paths:
123
+ - lib
124
+ required_ruby_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 2.4.1
137
+ signing_key:
138
+ specification_version: 4
139
+ summary: TBX (TermBase eXchange) file importer
140
+ test_files:
141
+ - spec/sample_files/sample_1(utf-16).tbx
142
+ - spec/sample_files/sample_1(utf-8).tbx
143
+ - spec/spec_helper.rb
144
+ - spec/tbx_importer_spec.rb