chem_scanner 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.rubocop.yml +604 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +20 -0
- data/LICENSE.txt +661 -0
- data/README.md +177 -0
- data/Rakefile +8 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/chem_scanner.gemspec +43 -0
- data/lib/chem_scanner.rb +79 -0
- data/lib/chem_scanner/cdx.rb +67 -0
- data/lib/chem_scanner/cdxml.rb +72 -0
- data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
- data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
- data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
- data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
- data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
- data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
- data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
- data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
- data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
- data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
- data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
- data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
- data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
- data/lib/chem_scanner/chem_draw/parser.rb +214 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
- data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
- data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
- data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
- data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.rb +76 -0
- data/lib/chem_scanner/configuration/superatom.txt +2874 -0
- data/lib/chem_scanner/configuration/util.rb +40 -0
- data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
- data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
- data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
- data/lib/chem_scanner/doc.rb +56 -0
- data/lib/chem_scanner/docx.rb +86 -0
- data/lib/chem_scanner/export/cml.rb +176 -0
- data/lib/chem_scanner/extension/element_map.rb +9 -0
- data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
- data/lib/chem_scanner/extension/geometry/line.rb +123 -0
- data/lib/chem_scanner/extension/geometry/point.rb +18 -0
- data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
- data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
- data/lib/chem_scanner/extension/passthrough.rb +7 -0
- data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
- data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
- data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
- data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
- data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
- data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
- data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
- data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
- data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
- data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
- data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
- data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
- data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
- data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
- data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
- data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
- data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
- data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
- data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
- data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
- data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
- data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
- data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
- data/lib/chem_scanner/interpreter/scheme.rb +173 -0
- data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
- data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
- data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
- data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
- data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
- data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
- data/lib/chem_scanner/perkin_eln.rb +287 -0
- data/lib/chem_scanner/version.rb +5 -0
- data/lib/rubygems_plugin.rb +5 -0
- metadata +244 -0
data/README.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
|
|
2
|
+
# Introduction
|
|
3
|
+
|
|
4
|
+
The `ChemScanner` library attempts to extract and interpret reaction and molecule information from ChemDraw-related file formats: CDX, CDXML, CDX embedded within DOC and DOCX, and [Perkin Elmer ELN](http://www.perkinelmer.com/category/notebook).
|
|
5
|
+
|
|
6
|
+
# Installation
|
|
7
|
+
|
|
8
|
+
Add this line to your application's Gemfile:
|
|
9
|
+
|
|
10
|
+
```ruby
|
|
11
|
+
gem 'chem_scanner'
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
And then execute:
|
|
15
|
+
|
|
16
|
+
$ bundle
|
|
17
|
+
|
|
18
|
+
Or install it yourself as:
|
|
19
|
+
|
|
20
|
+
$ gem install chem_scanner
|
|
21
|
+
|
|
22
|
+
# UI for ChemScanner
|
|
23
|
+
You can try `ChemScanner` at https://eln.chemotion.net/ or https://eln.chemotion.net/chemscanner. The UI is more user-friendly, with some additional features:
|
|
24
|
+
|
|
25
|
+
- Export to Excel and CML.
|
|
26
|
+
- Preview of the original scheme.
|
|
27
|
+
- Import directly to [Chemotion ELN](https://eln.chemotion.net)
|
|
28
|
+
- Add a comment to each extracted scheme. These comments also appear in the export and in the molecules/reactions imported into Chemotion ELN.
|
|
29
|
+
- ...
|
|
30
|
+
|
|
31
|
+
# Usage
|
|
32
|
+
|
|
33
|
+
To scan/extract a single CDX file
|
|
34
|
+
|
|
35
|
+
```ruby
|
|
36
|
+
require 'chem_scanner'
|
|
37
|
+
|
|
38
|
+
cdx = ChemScanner::Cdx.new
|
|
39
|
+
cdx.read('/path/to/cdx/file')
|
|
40
|
+
# Get array of scanned Canonical SMILES
|
|
41
|
+
cdx.molecules.map(&:get_cano_smiles)
|
|
42
|
+
# Get array of scanned Reactions in SMILES
|
|
43
|
+
cdx.reactions.map(&:reaction_smiles)
|
|
44
|
+
```
|
|
45
|
+
There are 5 classes corresponding to the 5 supported file formats: CDX, CDXML, DOC, DOCX, PerkinELN.
|
|
46
|
+
|
|
47
|
+
# API
|
|
48
|
+
|
|
49
|
+
## Molecule
|
|
50
|
+
|
|
51
|
+
- Access "scanned" molecules
|
|
52
|
+
|
|
53
|
+
```ruby
|
|
54
|
+
# Molecules - array of scanned molecules
|
|
55
|
+
cdx.molecules
|
|
56
|
+
# Get array of scanned Canonical SMILES
|
|
57
|
+
cdx.molecules.map(&:get_cano_smiles)
|
|
58
|
+
# Get one molecule
|
|
59
|
+
molecule = cdx.molecules.first
|
|
60
|
+
# Number of scanned molecules
|
|
61
|
+
cdx.molecules.count
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
- Molecule class:
|
|
65
|
+
|
|
66
|
+
```ruby
|
|
67
|
+
# Canonical SMILES
|
|
68
|
+
molecule.get_cano_smiles
|
|
69
|
+
# Molfile
|
|
70
|
+
molecule.get_mdl
|
|
71
|
+
# RDKIT RWMol (https://www.rdkit.org/docs/cppapi/classRDKit_1_1RWMol.html)
|
|
72
|
+
molecule.rw_mol
|
|
73
|
+
# Molecule label (bold text near molecule)
|
|
74
|
+
molecule.label
|
|
75
|
+
# Molecule text (molecule description)
|
|
76
|
+
molecule.text
|
|
77
|
+
# Molecule details (additional information from Perkin Elmer ELN)
|
|
78
|
+
molecule.details
|
|
79
|
+
```
|
|
80
|
+
We are using a [ruby-binding version](https://github.com/CamAnNguyen/rdkit_chem) of `RDKit` as a dependency of `ChemScanner`.
|
|
81
|
+
|
|
82
|
+
## Reaction
|
|
83
|
+
|
|
84
|
+
A reaction consists of 3 groups of molecules: `reactants`, `reagents` and `products`. Each group is an array of molecules, in which each element is an object of the `Molecule` class. In addition, some abbreviations belonging to the reaction are represented by SMILES. Those can be accessed via `reagent_smiles`.
|
|
85
|
+
|
|
86
|
+
```ruby
|
|
87
|
+
reaction = cdx.reactions.first
|
|
88
|
+
# Access extracted structure group
|
|
89
|
+
reactants = reaction.reactants
|
|
90
|
+
reagents = reaction.reagents
|
|
91
|
+
products = reaction.products
|
|
92
|
+
reagent_smiles = reaction.reagent_smiles
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Further manipulation of each group would be similar to `Molecule` class.
|
|
96
|
+
|
|
97
|
+
- **Reaction properties**
|
|
98
|
+
|
|
99
|
+
A reaction itself has `description`, `yield`, `time`, `temperature` and `details` properties. All these properties are extracted from the ChemDraw scheme, except the `details` field, which holds additional information from `PerkinELN`.
|
|
100
|
+
|
|
101
|
+
- **Reaction step**
|
|
102
|
+
|
|
103
|
+
Some multi-step reactions can also be recognized. If a reaction is a multi-step reaction, the "steps" could be accessed via:
|
|
104
|
+
|
|
105
|
+
```ruby
|
|
106
|
+
# Get first scanned reaction
|
|
107
|
+
reaction = cdx.reactions.first
|
|
108
|
+
# Access first step
|
|
109
|
+
step = reaction.steps.first
|
|
110
|
+
step.number # Should be 1
|
|
111
|
+
step.description
|
|
112
|
+
step.time
|
|
113
|
+
step.temperature
|
|
114
|
+
# List reagents SMILES
|
|
115
|
+
step.reagents
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Each step has the following properties: `description`, `time`, `temperature`, and `reagents`.
|
|
119
|
+
|
|
120
|
+
## Supported File Formats
|
|
121
|
+
|
|
122
|
+
CDX, CDXML, PerkinELN usage and API are described above. Their outputs are simple `molecules` and `reactions`.
|
|
123
|
+
|
|
124
|
+
The DOC and DOCX classes are a little bit different, since a DOC or DOCX file can contain more than one embedded ChemDraw scheme, each of which is one CDX scheme.
|
|
125
|
+
`ChemScanner` attempts to extract all of them and puts them into one `Hash` map, called `cdx_map`.
|
|
126
|
+
|
|
127
|
+
```ruby
|
|
128
|
+
require 'chem_scanner'
|
|
129
|
+
|
|
130
|
+
doc = ChemScanner::Doc.new
|
|
131
|
+
doc.read('/path/to/doc/file')
|
|
132
|
+
doc.cdx_map.each do |key, cdx|
|
|
133
|
+
puts cdx.reactions.map(&:reaction_smiles)
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
# Access all molecules in all CDXs
|
|
137
|
+
doc.molecules.map(&:get_cano_smiles)
|
|
138
|
+
# Access all reactions in all CDXs
|
|
139
|
+
doc.reactions.map(&:reaction_smiles)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
DOCX is a bit different: `ChemScanner` can extract the CDX together with its preview image from within the document.
|
|
143
|
+
|
|
144
|
+
```ruby
|
|
145
|
+
require 'chem_scanner'
|
|
146
|
+
|
|
147
|
+
docx = ChemScanner::Docx.new
|
|
148
|
+
docx.read('/path/to/docx/file')
|
|
149
|
+
docx.cdx_map.each do |key, cdx_info|
|
|
150
|
+
# Get the CDX scheme
|
|
151
|
+
cdx = cdx_info[:cdx]
|
|
152
|
+
puts cdx.reactions.map(&:reaction_smiles)
|
|
153
|
+
|
|
154
|
+
# Preview images, used for ChemScanner UI
|
|
155
|
+
img_ext = cdx_info[:img_ext] # Could be '.png', '.emf'
|
|
156
|
+
img_b64 = cdx_info[:img_b64] # Base64-encoded image data
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# Access all molecules in all CDXs
|
|
160
|
+
docx.molecules.map(&:get_cano_smiles)
|
|
161
|
+
# Access all reactions in all CDXs
|
|
162
|
+
docx.reactions.map(&:reaction_smiles)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
# Development
|
|
166
|
+
|
|
167
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
|
168
|
+
|
|
169
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
|
170
|
+
|
|
171
|
+
# Contributing
|
|
172
|
+
|
|
173
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/CamAnNguyen/chem_scanner. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
|
174
|
+
|
|
175
|
+
# License
|
|
176
|
+
|
|
177
|
+
The gem is available as open source under the terms of the [GNU AGPLv3 License](https://www.gnu.org/licenses/agpl-3.0.en.html).
|
data/Rakefile
ADDED
data/bin/console
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the gem through Bundler and then starts an
# interactive IRB session so the library can be tried out by hand.

require "bundler/setup"
require "chem_scanner"

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier. A different console may be used instead of IRB.

# (To use Pry, add pry to the Gemfile and enable the two lines below.)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
|
data/bin/setup
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Gem specification for chem_scanner: package metadata, the list of packaged
# files, and development/runtime dependency declarations.

# Make lib/ loadable so the version constant can be required below.
lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require "chem_scanner/version"

# rubocop:disable Metrics/BlockLength
Gem::Specification.new do |spec|
  spec.name = "chem_scanner"
  spec.version = ChemScanner::VERSION
  spec.authors = ["an.nguyen"]
  spec.email = ["an.nguyen@kit.edu"]

  spec.summary = "Extraction of chemical information"
  spec.description = "ChemScanner is a chemical utility to extract " \
                     "chemical information from various scientific formats"
  spec.homepage = "https://chemotion.net"
  # The README states that the gem is released under the GNU AGPLv3, so the
  # declared license must match it rather than MIT.
  # NOTE(review): confirm whether "AGPL-3.0-only" or "AGPL-3.0-or-later" is
  # the intended SPDX identifier.
  spec.license = "AGPL-3.0"

  # Package every git-tracked file except test/spec/features directories.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/})
    end
  end

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.required_ruby_version = ">= 2.3"

  spec.add_development_dependency "bundler", ">= 1.16"
  spec.add_development_dependency "rake", ">= 10.0"
  spec.add_development_dependency "rspec", ">= 3.0"

  spec.add_dependency "chronic_duration", ">= 0.10"
  spec.add_dependency "nokogiri", ">= 1.8"
  spec.add_dependency "rdkit_chem"
  spec.add_dependency "ruby-geometry", ">= 0.0.6"
  spec.add_dependency "ruby-ole", ">= 1.2"
end
# rubocop:enable Metrics/BlockLength
|
data/lib/chem_scanner.rb
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
require "yaml"
|
|
2
|
+
require "rdkit_chem"
|
|
3
|
+
require "ostruct"
|
|
4
|
+
require "forwardable"
|
|
5
|
+
|
|
6
|
+
# ChemScanner main module.
#
# Exposes module-level helpers that forward to two shared configuration
# objects: one for superatoms and one for abbreviations/solvents.
module ChemScanner
  # Require the extension and configuration files before the singletons below
  # are referenced (presumably these files define Superatom and Abbreviation).
  %w[
    chem_scanner/extension/*/*.rb
    chem_scanner/extension/*.rb
    chem_scanner/configuration/*.rb
  ].flat_map { |pattern| Gem.find_files(pattern) }.each { |file| require file }

  @superatom = Superatom.instance
  @abbreviation = Abbreviation.instance

  class << self
    # --- Superatoms --------------------------------------------------------

    # Forwards to Superatom#sync_custom.
    def sync_custom_superatom
      @superatom.sync_custom
    end

    # Forwards to Superatom#all.
    def all_superatoms
      @superatom.all
    end

    # Forwards to Superatom#predefined.
    def predefined_superatoms
      @superatom.predefined
    end

    # Forwards to Superatom#custom.
    def custom_superatoms
      @superatom.custom
    end

    # Looks up +superatom+ via Superatom#get_superatom.
    def get_superatom(superatom)
      @superatom.get_superatom(superatom)
    end

    # Registers +satom+ with the value +smi+ via Superatom#add.
    def add_superatom(satom, smi)
      @superatom.add(satom, smi)
    end

    # Removes +satom+ via Superatom#remove.
    def remove_superatom(satom)
      @superatom.remove(satom)
    end

    # --- Abbreviations -----------------------------------------------------

    # Forwards to Abbreviation#predefined.
    def predefined_abbreviations
      @abbreviation.predefined
    end

    # Forwards to Abbreviation#solvents.
    def solvents
      @abbreviation.solvents
    end

    # Forwards to Abbreviation#all.
    def all_abbreviations
      @abbreviation.all
    end

    # Looks up +abb+ as an abbreviation. When the superatom lookup for the
    # same name is non-empty, the superatom takes precedence and an empty
    # string is returned instead.
    def get_abbreviation(abb)
      return "" unless @superatom.get_superatom(abb).empty?

      @abbreviation.get_abbreviation(abb)
    end

    # Registers +abb+ with the value +smi+ via Abbreviation#add.
    def add_abbreviation(abb, smi)
      @abbreviation.add(abb, smi)
    end

    # Registers several abbreviations at once via Abbreviation#add_hash.
    def add_abbreviation_hash(hash)
      @abbreviation.add_hash(hash)
    end

    # Removes +abb+ via Abbreviation#remove.
    def remove_abbreviation(abb)
      @abbreviation.remove(abb)
    end
  end
end
|
|
77
|
+
|
|
78
|
+
Gem.find_files("chem_scanner/*.rb").each { |file| require file }
|
|
79
|
+
Gem.find_files("chem_scanner/export/*.rb").each { |file| require file }
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Top module
|
|
4
|
+
module ChemScanner
|
|
5
|
+
require "chem_scanner/chem_draw/parser"
|
|
6
|
+
require "chem_scanner/chem_draw/cdx_reader"
|
|
7
|
+
|
|
8
|
+
# Parser for binary CDX files: reads tags through ChemDraw::CdxReader, hands
# every object to the inherited parser and then interprets the result as a
# scheme of molecules and reactions.
class Cdx < ChemDraw::Parser
  attr_accessor :version

  # Tags searched for, in this order, while reading the document globals.
  CREATIONPROGRAM = 0x0003
  COLORTABLE = 0x0300
  FONTTABLE = 0x0100

  def initialize
    super

    @type = "cdx"
  end

  # Reads and interprets a CDX document.
  #
  # file    - passed straight to ChemDraw::CdxReader.new.
  # is_path - passed straight to ChemDraw::CdxReader.new (default: true).
  #
  # Returns false when the reader reports the input as not valid, true once
  # @molecules and @reactions have been filled from the interpreted scheme.
  def read(file, is_path = true)
    @reader = ChemDraw::CdxReader.new(file, is_path)
    return false unless @reader.valid

    read_global
    read_objects until @reader.end?
    rebuild_objects_map

    @scheme = Interpreter::Scheme.new(self)
    @scheme.interpret

    @molecules = @scheme.molecules
    @reactions = @scheme.reactions

    true
  end

  # Returns the reader's `bin` value.
  def raw_data
    @reader.bin
  end

  private

  # Reads the document-level values: creation-program version, colour table
  # and font table, each taken from the reader's data after reaching its tag.
  def read_global
    advance_to_tag(CREATIONPROGRAM)
    @version = @reader.data.split(" ").last

    advance_to_tag(COLORTABLE)
    @color_table = read_colortable(@reader.data, "cdx")

    advance_to_tag(FONTTABLE)
    @font_table = read_fonttable(@reader.data, "cdx")
  end

  # Keeps calling @reader.read_next until it returns +wanted+; always reads
  # at least once.
  # NOTE(review): no stop condition other than finding the tag is visible
  # here - confirm how read_next behaves when the tag is missing.
  def advance_to_tag(wanted)
    current = @reader.read_next
    current = @reader.read_next until current == wanted
  end

  # Parses objects one after another for as long as the reader yields a
  # positive tag.
  def read_objects
    while (tag = @reader.read_next(true)).positive?
      parse_object(tag, @reader.current_id)
    end
  end
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Main module
|
|
4
|
+
module ChemScanner
|
|
5
|
+
require "nokogiri"
|
|
6
|
+
require "chem_scanner/chem_draw/parser"
|
|
7
|
+
|
|
8
|
+
# Reads and parses CDXML (the XML flavour of ChemDraw documents) with
# Nokogiri, then interprets the parsed objects as a scheme of molecules and
# reactions.
class Cdxml < ChemDraw::Parser
  attr_accessor :version, :reader

  # System identifier a document's DOCTYPE must carry to be accepted.
  CDXML_DOCTYPE = "http://www.cambridgesoft.com/xml/cdxml.dtd"

  def initialize
    super

    @type = "cdxml"
  end

  # Reads and interprets a CDXML document.
  #
  # file    - a filesystem path when +is_path+ is true, otherwise anything
  #           Nokogiri::XML accepts.
  # is_path - whether +file+ is a path to open (default: true).
  #
  # Returns false when the document has no DOCTYPE or its system identifier
  # is not CDXML_DOCTYPE, true once @molecules and @reactions are filled.
  def read(file, is_path = true)
    # Block form closes the file handle as soon as the document is parsed.
    @cdxml = is_path ? File.open(file) { |fs| Nokogiri::XML(fs) } : Nokogiri::XML(file)
    # internal_subset is nil for XML without a DOCTYPE; treat that as invalid
    # input instead of raising NoMethodError.
    return false unless @cdxml.internal_subset&.system_id == CDXML_DOCTYPE

    read_global

    @cdxml.xpath("//page").each do |page|
      @reader = page
      read_objects
    end

    rebuild_objects_map

    @scheme = Interpreter::Scheme.new(self)
    @scheme.interpret

    @molecules = @scheme.molecules
    @reactions = @scheme.reactions

    true
  end

  # Returns the parsed document serialized back to XML.
  def raw_data
    @cdxml.to_xml
  end

  # Reads the document-level values: creation-program version, colour table
  # and font table.
  def read_global
    @version = @cdxml.xpath("//CDXML/@CreationProgram").text.split(" ").last

    ct = @cdxml.xpath("//CDXML/colortable").first
    @color_table = read_colortable(ct, "cdxml")

    ft = @cdxml.xpath("//CDXML/fonttable").first
    @font_table = read_fonttable(ft, "cdxml")
  end

  # Walks the element children of the current @reader node. Children mapped
  # to "Group" in ChemDraw::CDXML_OBJ are descended into; every other child
  # is handed to parse_object with its name and integer id (0 when the node
  # has no id attribute).
  def read_objects
    nodes = @reader.element_children

    nodes.each do |node|
      # @reader is repointed at the node being processed
      # (presumably parse_object reads from it - confirm in ChemDraw::Parser).
      @reader = node
      nid = (node.attr("id") || 0).to_i

      if ChemDraw::CDXML_OBJ[node.name] == "Group"
        read_objects
      else
        parse_object(node.name, nid)
      end
    end
  end
end
|
|
72
|
+
end
|