chem_scanner 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.rubocop.yml +604 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +661 -0
- data/README.md +177 -0
- data/Rakefile +8 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/chem_scanner.gemspec +43 -0
- data/lib/chem_scanner.rb +79 -0
- data/lib/chem_scanner/cdx.rb +67 -0
- data/lib/chem_scanner/cdxml.rb +72 -0
- data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
- data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
- data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
- data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
- data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
- data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
- data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
- data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
- data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
- data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
- data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
- data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
- data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
- data/lib/chem_scanner/chem_draw/parser.rb +214 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
- data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.txt +2874 -0
- data/lib/chem_scanner/configuration/util.rb +40 -0
- data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
- data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
- data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
- data/lib/chem_scanner/doc.rb +56 -0
- data/lib/chem_scanner/docx.rb +86 -0
- data/lib/chem_scanner/export/cml.rb +176 -0
- data/lib/chem_scanner/extension/element_map.rb +9 -0
- data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
- data/lib/chem_scanner/extension/geometry/line.rb +123 -0
- data/lib/chem_scanner/extension/geometry/point.rb +18 -0
- data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
- data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
- data/lib/chem_scanner/extension/passthrough.rb +7 -0
- data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
- data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
- data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
- data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
- data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
- data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
- data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
- data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
- data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
- data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
- data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
- data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
- data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
- data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
- data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
- data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
- data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
- data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
- data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
- data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
- data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
- data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
- data/lib/chem_scanner/interpreter/scheme.rb +173 -0
- data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
- data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
- data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
- data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
- data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
- data/lib/chem_scanner/perkin_eln.rb +287 -0
- data/lib/chem_scanner/version.rb +5 -0
- data/lib/rubygems_plugin.rb +5 -0
- metadata +244 -0
data/README.md
ADDED
@@ -0,0 +1,177 @@
|
|
1
|
+
|
2
|
+
# Introduction
|
3
|
+
|
4
|
+
The `ChemScanner` library attempts to extract and interpret reaction/molecule information from ChemDraw-related file formats: CDX, CDXML, CDX embedded within DOC and DOCX, and [Perkin Elmer ELN](http://www.perkinelmer.com/category/notebook).
|
5
|
+
|
6
|
+
# Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'chem_scanner'
|
12
|
+
```
|
13
|
+
|
14
|
+
And then execute:
|
15
|
+
|
16
|
+
$ bundle
|
17
|
+
|
18
|
+
Or install it yourself as:
|
19
|
+
|
20
|
+
$ gem install chem_scanner
|
21
|
+
|
22
|
+
# UI for ChemScanner
|
23
|
+
You can try `ChemScanner` at https://eln.chemotion.net/ or https://eln.chemotion.net/chemscanner. The UI is more user-friendly, with some additional features:
|
24
|
+
|
25
|
+
- Export to Excel and CML.
|
26
|
+
- Preview of the original scheme.
|
27
|
+
- Import directly to [Chemotion ELN](https://eln.chemotion.net)
|
28
|
+
- Add comments to each extracted scheme. These comments also appear in the export and in the molecules/reactions imported into Chemotion ELN.
|
29
|
+
- ...
|
30
|
+
|
31
|
+
# Usage
|
32
|
+
|
33
|
+
To scan/extract a single CDX file
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
require 'chem_scanner'
|
37
|
+
|
38
|
+
cdx = ChemScanner::Cdx.new
|
39
|
+
cdx.read('/path/to/cdx/file')
|
40
|
+
# Get array of scanned Canonical SMILES
|
41
|
+
cdx.molecules.map(&:get_cano_smiles)
|
42
|
+
# Get array of scanned Reactions in SMILES
|
43
|
+
cdx.reactions.map(&:reaction_smiles)
|
44
|
+
```
|
45
|
+
There are 5 classes corresponding to the 5 supported file formats: CDX, CDXML, DOC, DOCX, PerkinELN.
|
46
|
+
|
47
|
+
# API
|
48
|
+
|
49
|
+
## Molecule
|
50
|
+
|
51
|
+
- Access "scanned" molecules
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
# Molecules - array of scanned molecules
|
55
|
+
cdx.molecules
|
56
|
+
# Get array of scanned Canonical SMILES
|
57
|
+
cdx.molecules.map(&:get_cano_smiles)
|
58
|
+
# Get one molecule
|
59
|
+
molecule = cdx.molecules.first
|
60
|
+
# Number of scanned molecules
|
61
|
+
cdx.molecules.count
|
62
|
+
```
|
63
|
+
|
64
|
+
- Molecule class:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
# Canonical SMILES
|
68
|
+
molecule.get_cano_smiles
|
69
|
+
# Molfile
|
70
|
+
molecule.get_mdl
|
71
|
+
# RDKIT RWMol (https://www.rdkit.org/docs/cppapi/classRDKit_1_1RWMol.html)
|
72
|
+
molecule.rw_mol
|
73
|
+
# Molecule label (bold text near molecule)
|
74
|
+
molecule.label
|
75
|
+
# Molecule text (molecule description)
|
76
|
+
molecule.text
|
77
|
+
# Molecule details (additional information from Perkin Elmer ELN)
|
78
|
+
molecule.details
|
79
|
+
```
|
80
|
+
We are using a [ruby-binding version](https://github.com/CamAnNguyen/rdkit_chem) of `RDKit` as a dependency of `ChemScanner`.
|
81
|
+
|
82
|
+
## Reaction
|
83
|
+
|
84
|
+
A reaction consists of 3 groups of molecules: `reactants`, `reagents` and `products`. Each group is an array of molecules, in which each element is an object of the `Molecule` class. In addition, some abbreviations belonging to the reaction are represented by SMILES. Those can be accessed via `reagent_smiles`.
|
85
|
+
|
86
|
+
```ruby
|
87
|
+
reaction = cdx.reactions.first
|
88
|
+
# Access extracted structure group
|
89
|
+
reactants = reaction.reactants
|
90
|
+
reagents = reaction.reagents
|
91
|
+
products = reaction.products
|
92
|
+
reagent_smiles = reaction.reagent_smiles
|
93
|
+
```
|
94
|
+
|
95
|
+
Further manipulation of each group would be similar to `Molecule` class.
|
96
|
+
|
97
|
+
- **Reaction properties**
|
98
|
+
|
99
|
+
The reaction itself has `description`, `yield`, `time`, `temperature` and `details` properties. All these properties are extracted from the ChemDraw scheme, except the `details` field, which holds additional information from `PerkinELN`.
|
100
|
+
|
101
|
+
- **Reaction step**
|
102
|
+
|
103
|
+
Some multi-step reactions can also be recognized. If a reaction is a multi-step reaction, the "steps" could be accessed via:
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
# Get first scanned reaction
|
107
|
+
reaction = cdx.reactions.first
|
108
|
+
# Access first step
|
109
|
+
step = reaction.steps.first
|
110
|
+
step.number # Should be 1
|
111
|
+
step.description
|
112
|
+
step.time
|
113
|
+
step.temperature
|
114
|
+
# List reagents SMILES
|
115
|
+
step.reagents
|
116
|
+
```
|
117
|
+
|
118
|
+
Each step has these following properties: `description`, `time`, `temperature`, and `reagents`
|
119
|
+
|
120
|
+
## Supported File Formats
|
121
|
+
|
122
|
+
CDX, CDXML, PerkinELN usage and API are described above. Their outputs are simple `molecules` and `reactions`.
|
123
|
+
|
124
|
+
The DOC and DOCX classes are a little bit different, since a DOC or DOCX file can contain more than one embedded ChemDraw scheme, each of which is one CDX scheme.
|
125
|
+
`ChemScanner` attempts to extract all of them and put into one `Hash` map, called `cdx_map`.
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
require 'chem_scanner'
|
129
|
+
|
130
|
+
doc = ChemScanner::Doc.new
|
131
|
+
doc.read('/path/to/doc/file')
|
132
|
+
doc.cdx_map.each do |key, cdx|
|
133
|
+
puts cdx.reactions.map(&:reaction_smiles)
|
134
|
+
end
|
135
|
+
|
136
|
+
# Access all molecules in all CDXs
|
137
|
+
doc.molecules.map(&:get_cano_smiles)
|
138
|
+
# Access all reactions in all CDXs
|
139
|
+
doc.reactions.map(&:reaction_smiles)
|
140
|
+
```
|
141
|
+
|
142
|
+
DOCX is a bit different: `ChemScanner` can extract each CDX together with its preview image from the document.
|
143
|
+
|
144
|
+
```ruby
|
145
|
+
require 'chem_scanner'
|
146
|
+
|
147
|
+
docx = ChemScanner::Docx.new
|
148
|
+
docx.read('/path/to/docx/file')
|
149
|
+
docx.cdx_map.each do |key, cdx_info|
|
150
|
+
# Get the CDX scheme
|
151
|
+
cdx = cdx_info[:cdx]
|
152
|
+
puts cdx.reactions.map(&:reaction_smiles)
|
153
|
+
|
154
|
+
# Preview images, used for ChemScanner UI
|
155
|
+
img_ext = cdx_info[:img_ext] # Could be '.png', '.emf'
|
156
|
+
img_b64 = cdx_info[:img_b64] # Base64 encoded of image
|
157
|
+
end
|
158
|
+
|
159
|
+
# Access all molecules in all CDXs
|
160
|
+
docx.molecules.map(&:get_cano_smiles)
|
161
|
+
# Access all reactions in all CDXs
|
162
|
+
docx.reactions.map(&:reaction_smiles)
|
163
|
+
```
|
164
|
+
|
165
|
+
# Development
|
166
|
+
|
167
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
168
|
+
|
169
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
170
|
+
|
171
|
+
# Contributing
|
172
|
+
|
173
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/CamAnNguyen/chem_scanner. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
174
|
+
|
175
|
+
# License
|
176
|
+
|
177
|
+
The gem is available as open source under the terms of the [GNU AGPLv3 License](https://www.gnu.org/licenses/agpl-3.0.en.html).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "chem_scanner"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true

# Gem specification for chem_scanner.

# Make lib/ loadable so the version constant can be required below.
lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require "chem_scanner/version"

# rubocop:disable Metrics/BlockLength
Gem::Specification.new do |spec|
  spec.name = "chem_scanner"
  spec.version = ChemScanner::VERSION
  spec.authors = ["an.nguyen"]
  spec.email = ["an.nguyen@kit.edu"]

  spec.summary = "Extraction of chemical information"
  spec.description = "ChemScanner is a chemical utility to extract " \
                     "chemical information from various scientific formats"
  spec.homepage = "https://chemotion.net"
  # The README and the bundled LICENSE.txt state GNU AGPLv3, so the metadata
  # must not advertise MIT.
  # NOTE(review): confirm whether "AGPL-3.0-only" or "AGPL-3.0-or-later" is
  # the intended SPDX identifier.
  spec.license = "AGPL-3.0"

  # Package every git-tracked file except test/spec/feature directories.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/})
    end
  end

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.required_ruby_version = ">= 2.3"

  spec.add_development_dependency "bundler", ">= 1.16"
  spec.add_development_dependency "rake", ">= 10.0"
  spec.add_development_dependency "rspec", ">= 3.0"

  spec.add_dependency "chronic_duration", ">= 0.10"
  spec.add_dependency "nokogiri", ">= 1.8"
  spec.add_dependency "rdkit_chem"
  spec.add_dependency "ruby-geometry", ">= 0.0.6"
  spec.add_dependency "ruby-ole", ">= 1.2"
end
# rubocop:enable Metrics/BlockLength
|
data/lib/chem_scanner.rb
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
require "yaml"
|
2
|
+
require "rdkit_chem"
|
3
|
+
require "ostruct"
|
4
|
+
require "forwardable"
|
5
|
+
|
6
|
+
# Library entry point. Loads the extension and configuration sources, then
# exposes module-level helpers that forward to the objects returned by
# Superatom.instance and Abbreviation.instance.
module ChemScanner
  # Collect every matching file first (nested extensions, top-level
  # extensions, configuration — in that order), then require them.
  %w[
    chem_scanner/extension/*/*.rb
    chem_scanner/extension/*.rb
    chem_scanner/configuration/*.rb
  ].flat_map { |pattern| Gem.find_files(pattern) }.each { |path| require path }

  @superatom = Superatom.instance
  @abbreviation = Abbreviation.instance

  class << self
    # --- Superatom helpers (forwarded to @superatom) ---

    def sync_custom_superatom
      @superatom.sync_custom
    end

    def all_superatoms
      @superatom.all
    end

    def predefined_superatoms
      @superatom.predefined
    end

    def custom_superatoms
      @superatom.custom
    end

    def get_superatom(superatom)
      @superatom.get_superatom(superatom)
    end

    def add_superatom(satom, smi)
      @superatom.add(satom, smi)
    end

    def remove_superatom(satom)
      @superatom.remove(satom)
    end

    # --- Abbreviation helpers (forwarded to @abbreviation) ---

    def predefined_abbreviations
      @abbreviation.predefined
    end

    def solvents
      @abbreviation.solvents
    end

    def all_abbreviations
      @abbreviation.all
    end

    # Looks up +abb+ as an abbreviation, but only when the superatom lookup
    # for the same key comes back empty; otherwise returns an empty string.
    def get_abbreviation(abb)
      return "" unless @superatom.get_superatom(abb).empty?

      @abbreviation.get_abbreviation(abb)
    end

    def add_abbreviation(abb, smi)
      @abbreviation.add(abb, smi)
    end

    def add_abbreviation_hash(hash)
      @abbreviation.add_hash(hash)
    end

    def remove_abbreviation(abb)
      @abbreviation.remove(abb)
    end
  end
end
|
77
|
+
|
78
|
+
Gem.find_files("chem_scanner/*.rb").each { |file| require file }
|
79
|
+
Gem.find_files("chem_scanner/export/*.rb").each { |file| require file }
|
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Top-level ChemScanner namespace
module ChemScanner
  require "chem_scanner/chem_draw/parser"
  require "chem_scanner/chem_draw/cdx_reader"

  # Parser for binary CDX input. Walks the tagged records supplied by
  # ChemDraw::CdxReader, hands each object to the inherited parser, and then
  # runs the scheme interpreter to obtain molecules and reactions.
  class Cdx < ChemDraw::Parser
    attr_accessor :version

    # Tags looked for, in this order, while reading the document header.
    CREATIONPROGRAM = 0x0003
    COLORTABLE = 0x0300
    FONTTABLE = 0x0100

    def initialize
      super

      @type = "cdx"
    end

    # Reads and interprets a CDX document.
    #
    # file    - input handed to ChemDraw::CdxReader.
    # is_path - forwarded unchanged to the reader (defaults to true).
    #
    # Returns false when the reader reports the input as invalid, true once
    # @molecules and @reactions have been populated.
    def read(file, is_path = true)
      @reader = ChemDraw::CdxReader.new(file, is_path)
      return false unless @reader.valid

      read_global
      read_objects until @reader.end?
      rebuild_objects_map

      @scheme = Interpreter::Scheme.new(self)
      @scheme.interpret

      @molecules = @scheme.molecules
      @reactions = @scheme.reactions

      true
    end

    # Binary content held by the reader.
    def raw_data
      @reader.bin
    end

    private

    # Advances the reader to the creation-program, color-table and font-table
    # records (in that order) and stores what each one carries.
    def read_global
      # Keep reading until the reader returns the wanted tag.
      seek = ->(wanted) { loop { break if @reader.read_next == wanted } }

      seek.call(CREATIONPROGRAM)
      @version = @reader.data.split(" ").last

      seek.call(COLORTABLE)
      @color_table = read_colortable(@reader.data, "cdx")

      seek.call(FONTTABLE)
      @font_table = read_fonttable(@reader.data, "cdx")
    end

    # Parses consecutive objects for as long as the reader yields a positive
    # tag.
    def read_objects
      loop do
        tag = @reader.read_next(true)
        break unless tag.positive?

        parse_object(tag, @reader.current_id)
      end
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Main module
module ChemScanner
  require "nokogiri"
  require "chem_scanner/chem_draw/parser"

  # Reads a CDXML document with Nokogiri, hands every object node found under
  # its pages to the inherited parser, and then runs the scheme interpreter to
  # obtain molecules and reactions.
  class Cdxml < ChemDraw::Parser
    attr_accessor :version, :reader

    # DOCTYPE system identifier a document must declare to be accepted.
    CDXML_DOCTYPE = "http://www.cambridgesoft.com/xml/cdxml.dtd"

    def initialize
      super

      @type = "cdxml"
    end

    # Reads and interprets a CDXML document.
    #
    # file    - a filesystem path when is_path is true; otherwise any input
    #           Nokogiri::XML accepts.
    # is_path - whether +file+ is a path to open (defaults to true).
    #
    # Returns false when the document does not declare the CDXML DOCTYPE
    # (including documents with no DOCTYPE at all), true once @molecules and
    # @reactions have been populated.
    def read(file, is_path = true)
      # Block form so the file handle is closed as soon as parsing finishes.
      @cdxml = is_path ? File.open(file) { |fs| Nokogiri::XML(fs) } : Nokogiri::XML(file)

      # internal_subset is nil when the document has no DOCTYPE; treat that
      # as "not CDXML" rather than raising NoMethodError.
      return false if @cdxml.internal_subset&.system_id != CDXML_DOCTYPE

      read_global

      @cdxml.xpath("//page").each do |page|
        @reader = page
        read_objects
      end

      rebuild_objects_map

      @scheme = Interpreter::Scheme.new(self)
      @scheme.interpret

      @molecules = @scheme.molecules
      @reactions = @scheme.reactions

      true
    end

    # The parsed document serialized back to XML.
    def raw_data
      @cdxml.to_xml
    end

    # Extracts the creation-program version plus the color and font tables
    # from the document root.
    def read_global
      @version = @cdxml.xpath("//CDXML/@CreationProgram").text.split(" ").last

      ct = @cdxml.xpath("//CDXML/colortable").first
      @color_table = read_colortable(ct, "cdxml")

      ft = @cdxml.xpath("//CDXML/fonttable").first
      @font_table = read_fonttable(ft, "cdxml")
    end

    # Walks the element children of the current @reader node. Nodes mapped to
    # "Group" in ChemDraw::CDXML_OBJ are descended into; every other node is
    # parsed directly.
    def read_objects
      nodes = @reader.element_children

      nodes.each do |node|
        # @reader always points at the node being processed; the recursive
        # call below starts from it.
        @reader = node
        nid = (node.attr("id") || 0).to_i

        if ChemDraw::CDXML_OBJ[node.name] == "Group"
          read_objects
        else
          parse_object(node.name, nid)
        end
      end
    end
  end
end
|