chem_scanner 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
data/README.md ADDED
@@ -0,0 +1,177 @@
1
+
2
+ # Introduction
3
+
4
+ The `ChemScanner` library attempts to extract and interpret reaction/molecule information from ChemDraw-related file formats: CDX, CDXML, CDX embedded within DOC and DOCX, and [Perkin Elmer ELN](http://www.perkinelmer.com/category/notebook).
5
+
6
+ # Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'chem_scanner'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install chem_scanner
21
+
22
+ # UI for ChemScanner
23
+ You can try `ChemScanner` at https://eln.chemotion.net/ or https://eln.chemotion.net/chemscanner. The UI is more user-friendly and offers some additional features:
24
+
25
+ - Export to Excel and CML.
26
+ - Preview of the original scheme.
27
+ - Import directly to [Chemotion ELN](https://eln.chemotion.net)
28
+ - Add a comment to each extracted scheme. These comments also appear in the export and in the molecules/reactions imported into Chemotion ELN.
29
+ - ...
30
+
31
+ # Usage
32
+
33
+ To scan/extract a single CDX file
34
+
35
+ ```ruby
36
+ require 'chem_scanner'
37
+
38
+ cdx = ChemScanner::Cdx.new
39
+ cdx.read('/path/to/cdx/file')
40
+ # Get array of scanned Canonical SMILES
41
+ cdx.molecules.map(&:get_cano_smiles)
42
+ # Get array of scanned Reactions in SMILES
43
+ cdx.reactions.map(&:reaction_smiles)
44
+ ```
45
+ There are 5 classes corresponding to the 5 supported file formats: CDX, CDXML, DOC, DOCX, PerkinELN.
46
+
47
+ # API
48
+
49
+ ## Molecule
50
+
51
+ - Access "scanned" molecules
52
+
53
+ ```ruby
54
+ # Molecules - array of scanned molecules
55
+ cdx.molecules
56
+ # Get array of scanned Canonical SMILES
57
+ cdx.molecules.map(&:get_cano_smiles)
58
+ # Get one molecule
59
+ molecule = cdx.molecules.first
60
+ # Number of scanned molecules
61
+ cdx.molecules.count
62
+ ```
63
+
64
+ - Molecule class:
65
+
66
+ ```ruby
67
+ # Canonical SMILES
68
+ molecule.get_cano_smiles
69
+ # Molfile
70
+ molecule.get_mdl
71
+ # RDKIT RWMol (https://www.rdkit.org/docs/cppapi/classRDKit_1_1RWMol.html)
72
+ molecule.rw_mol
73
+ # Molecule label (bold text near molecule)
74
+ molecule.label
75
+ # Molecule text (molecule description)
76
+ molecule.text
77
+ # Molecule details (additional information from Perkin Elmer ELN)
78
+ molecule.details
79
+ ```
80
+ We are using a [ruby-binding version](https://github.com/CamAnNguyen/rdkit_chem) of `RDKit` as a dependency of `ChemScanner`.
81
+
82
+ ## Reaction
83
+
84
+ A reaction consists of 3 groups of molecules: `reactants`, `reagents` and `products`. Each group is an array of molecules, in which each element is an object of the `Molecule` class. In addition, some abbreviations that belong to the reaction are represented by SMILES. Those can be accessed via `reagent_smiles`.
85
+
86
+ ```ruby
87
+ reaction = cdx.reactions.first
88
+ # Access extracted structure group
89
+ reactants = reaction.reactants
90
+ reagents = reaction.reagents
91
+ products = reaction.products
92
+ reagent_smiles = reaction.reagent_smiles
93
+ ```
94
+
95
+ Further manipulation of each group would be similar to `Molecule` class.
96
+
97
+ - **Reaction properties**
98
+
99
+ The reaction itself has `description`, `yield`, `time`, `temperature` and `details` properties. All of these properties are extracted from the ChemDraw scheme, except the `details` field, which holds additional information from `PerkinELN`.
100
+
101
+ - **Reaction step**
102
+
103
+ Some multi-step reactions can also be recognized. If a reaction is a multi-step reaction, the "steps" could be accessed via:
104
+
105
+ ```ruby
106
+ # Get first scanned reaction
107
+ reaction = cdx.reactions.first
108
+ # Access first step
109
+ step = reaction.steps.first
110
+ step.number # Should be 1
111
+ step.description
112
+ step.time
113
+ step.temperature
114
+ # List reagents SMILES
115
+ step.reagents
116
+ ```
117
+
118
+ Each step has the following properties: `description`, `time`, `temperature`, and `reagents`.
119
+
120
+ ## Supported File Formats
121
+
122
+ CDX, CDXML, PerkinELN usage and API are described above. Their outputs are simple `molecules` and `reactions`.
123
+
124
+ The DOC and DOCX classes are a little bit different, since a DOC or DOCX file can contain more than one embedded ChemDraw scheme, each of which is one CDX scheme.
125
+ `ChemScanner` attempts to extract all of them and puts them into one `Hash` map, called `cdx_map`.
126
+
127
+ ```ruby
128
+ require 'chem_scanner'
129
+
130
+ doc = ChemScanner::Doc.new
131
+ doc.read('/path/to/doc/file')
132
+ doc.cdx_map.each do |key, cdx|
133
+ puts cdx.reactions.map(&:reaction_smiles)
134
+ end
135
+
136
+ # Access all molecules in all CDXs
137
+ doc.molecules.map(&:get_cano_smiles)
138
+ # Access all reactions in all CDXs
139
+ doc.reactions.map(&:reaction_smiles)
140
+ ```
141
+
142
+ DOCX is a bit different: `ChemScanner` can extract each CDX together with its preview image from the document.
143
+
144
+ ```ruby
145
+ require 'chem_scanner'
146
+
147
+ docx = ChemScanner::Docx.new
148
+ docx.read('/path/to/docx/file')
149
+ docx.cdx_map.each do |key, cdx_info|
150
+ # Get the CDX scheme
151
+ cdx = cdx_info[:cdx]
152
+ puts cdx.reactions.map(&:reaction_smiles)
153
+
154
+ # Preview images, used for ChemScanner UI
155
+ img_ext = cdx_info[:img_ext] # Could be '.png', '.emf'
156
+ img_b64 = cdx_info[:img_b64] # Base64-encoded image
157
+ end
158
+
159
+ # Access all molecules in all CDXs
160
+ docx.molecules.map(&:get_cano_smiles)
161
+ # Access all reactions in all CDXs
162
+ docx.reactions.map(&:reaction_smiles)
163
+ ```
164
+
165
+ # Development
166
+
167
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
168
+
169
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
170
+
171
+ # Contributing
172
+
173
+ Bug reports and pull requests are welcome on GitHub at https://github.com/CamAnNguyen/chem_scanner. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
174
+
175
+ # License
176
+
177
+ The gem is available as open source under the terms of the [GNU AGPLv3 License](https://www.gnu.org/licenses/agpl-3.0.en.html).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "chem_scanner"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ lib = File.expand_path("lib", __dir__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
6
+ require "chem_scanner/version"
7
+
8
+ # rubocop:disable Metrics/BlockLength
9
+ Gem::Specification.new do |spec|
10
+ spec.name = "chem_scanner"
11
+ spec.version = ChemScanner::VERSION
12
+ spec.authors = ["an.nguyen"]
13
+ spec.email = ["an.nguyen@kit.edu"]
14
+
15
+ spec.summary = "Extraction of chemical information"
16
+ spec.description = "ChemScanner is a chemical utiliy to extract " \
17
+ "chemical information from various scientific formats"
18
+ spec.homepage = "https://chemotion.net"
19
+ spec.license = "MIT"
20
+
21
+ spec.files = Dir.chdir(File.expand_path(__dir__)) do
22
+ `git ls-files -z`.split("\x0").reject do |f|
23
+ f.match(%r{^(test|spec|features)/})
24
+ end
25
+ end
26
+
27
+ spec.bindir = "exe"
28
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+
31
+ spec.required_ruby_version = ">= 2.3"
32
+
33
+ spec.add_development_dependency "bundler", ">= 1.16"
34
+ spec.add_development_dependency "rake", ">= 10.0"
35
+ spec.add_development_dependency "rspec", ">= 3.0"
36
+
37
+ spec.add_dependency "chronic_duration", ">= 0.10"
38
+ spec.add_dependency "nokogiri", ">= 1.8"
39
+ spec.add_dependency "rdkit_chem"
40
+ spec.add_dependency "ruby-geometry", ">= 0.0.6"
41
+ spec.add_dependency "ruby-ole", ">= 1.2"
42
+ end
43
+ # rubocop:enable Metrics/BlockLength
@@ -0,0 +1,79 @@
1
+ require "yaml"
2
+ require "rdkit_chem"
3
+ require "ostruct"
4
+ require "forwardable"
5
+
6
+ # ChemScanner main module
7
+ module ChemScanner
8
+ (
9
+ Gem.find_files("chem_scanner/extension/*/*.rb") +
10
+ Gem.find_files("chem_scanner/extension/*.rb") +
11
+ Gem.find_files("chem_scanner/configuration/*.rb")
12
+ ).each { |file| require file }
13
+
14
+ @superatom = Superatom.instance
15
+ @abbreviation = Abbreviation.instance
16
+
17
+ def self.sync_custom_superatom
18
+ @superatom.sync_custom
19
+ end
20
+
21
+ def self.all_superatoms
22
+ @superatom.all
23
+ end
24
+
25
+ def self.predefined_superatoms
26
+ @superatom.predefined
27
+ end
28
+
29
+ def self.custom_superatoms
30
+ @superatom.custom
31
+ end
32
+
33
+ def self.get_superatom(superatom)
34
+ @superatom.get_superatom(superatom)
35
+ end
36
+
37
+ def self.add_superatom(satom, smi)
38
+ @superatom.add(satom, smi)
39
+ end
40
+
41
+ def self.remove_superatom(satom)
42
+ @superatom.remove(satom)
43
+ end
44
+
45
+ def self.predefined_abbreviations
46
+ @abbreviation.predefined
47
+ end
48
+
49
+ def self.solvents
50
+ @abbreviation.solvents
51
+ end
52
+
53
+ def self.all_abbreviations
54
+ @abbreviation.all
55
+ end
56
+
57
+ def self.get_abbreviation(abb)
58
+ if @superatom.get_superatom(abb).empty?
59
+ @abbreviation.get_abbreviation(abb)
60
+ else
61
+ ""
62
+ end
63
+ end
64
+
65
+ def self.add_abbreviation(abb, smi)
66
+ @abbreviation.add(abb, smi)
67
+ end
68
+
69
+ def self.add_abbreviation_hash(hash)
70
+ @abbreviation.add_hash(hash)
71
+ end
72
+
73
+ def self.remove_abbreviation(abb)
74
+ @abbreviation.remove(abb)
75
+ end
76
+ end
77
+
78
+ Gem.find_files("chem_scanner/*.rb").each { |file| require file }
79
+ Gem.find_files("chem_scanner/export/*.rb").each { |file| require file }
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Top module
4
+ module ChemScanner
5
+ require "chem_scanner/chem_draw/parser"
6
+ require "chem_scanner/chem_draw/cdx_reader"
7
+
8
+ # Class which traverses the tree in CDX binary files and parses it
9
+ class Cdx < ChemDraw::Parser
10
+ attr_accessor :version
11
+
12
+ CREATIONPROGRAM = 0x0003
13
+ COLORTABLE = 0x0300
14
+ FONTTABLE = 0x0100
15
+
16
+ def initialize
17
+ super
18
+
19
+ @type = "cdx"
20
+ end
21
+
22
+ def read(file, is_path = true)
23
+ @reader = ChemDraw::CdxReader.new(file, is_path)
24
+ return false unless @reader.valid
25
+
26
+ read_global
27
+ read_objects until @reader.end?
28
+ rebuild_objects_map
29
+
30
+ @scheme = Interpreter::Scheme.new(self)
31
+ @scheme.interpret
32
+
33
+ @molecules = @scheme.molecules
34
+ @reactions = @scheme.reactions
35
+
36
+ true
37
+ end
38
+
39
+ def raw_data
40
+ @reader.bin
41
+ end
42
+
43
+ private
44
+
45
+ def read_global
46
+ tag = @reader.read_next until tag == CREATIONPROGRAM
47
+ @version = @reader.data.split(" ").last
48
+
49
+ tag = @reader.read_next until tag == COLORTABLE
50
+ @color_table = read_colortable(@reader.data, "cdx")
51
+
52
+ tag = @reader.read_next until tag == FONTTABLE
53
+ @font_table = read_fonttable(@reader.data, "cdx")
54
+ end
55
+
56
+ def read_objects
57
+ tag = @reader.read_next(true)
58
+
59
+ while tag.positive?
60
+ cid = @reader.current_id
61
+ parse_object(tag, cid)
62
+
63
+ tag = @reader.read_next(true)
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Main module
4
+ module ChemScanner
5
+ require "nokogiri"
6
+ require "chem_scanner/chem_draw/parser"
7
+
8
+ # Read and Parse CDXML
9
+ class Cdxml < ChemDraw::Parser
10
+ attr_accessor :version, :reader
11
+
12
+ CDXML_DOCTYPE = "http://www.cambridgesoft.com/xml/cdxml.dtd"
13
+
14
+ def initialize
15
+ super
16
+
17
+ @type = "cdxml"
18
+ end
19
+
20
+ def read(file, is_path = true)
21
+ fs = is_path ? File.open(file) : file
22
+ @cdxml = Nokogiri::XML(fs)
23
+ return false if @cdxml.internal_subset.system_id != CDXML_DOCTYPE
24
+
25
+ read_global
26
+
27
+ @cdxml.xpath("//page").each do |page|
28
+ @reader = page
29
+ read_objects
30
+ end
31
+
32
+ rebuild_objects_map
33
+
34
+ @scheme = Interpreter::Scheme.new(self)
35
+ @scheme.interpret
36
+
37
+ @molecules = @scheme.molecules
38
+ @reactions = @scheme.reactions
39
+
40
+ true
41
+ end
42
+
43
+ def raw_data
44
+ @cdxml.to_xml
45
+ end
46
+
47
+ def read_global
48
+ @version = @cdxml.xpath("//CDXML/@CreationProgram").text.split(" ").last
49
+
50
+ ct = @cdxml.xpath("//CDXML/colortable").first
51
+ @color_table = read_colortable(ct, "cdxml")
52
+
53
+ ft = @cdxml.xpath("//CDXML/fonttable").first
54
+ @font_table = read_fonttable(ft, "cdxml")
55
+ end
56
+
57
+ def read_objects
58
+ nodes = @reader.element_children
59
+
60
+ nodes.each do |node|
61
+ @reader = node
62
+ nid = (node.attr("id") || 0).to_i
63
+
64
+ if ChemDraw::CDXML_OBJ[node.name] == "Group"
65
+ read_objects
66
+ else
67
+ parse_object(node.name, nid)
68
+ end
69
+ end
70
+ end
71
+ end
72
+ end