chem_scanner 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yml +604 -0
  5. data/.ruby-gemset +1 -0
  6. data/.ruby-version +1 -0
  7. data/.travis.yml +5 -0
  8. data/CODE_OF_CONDUCT.md +74 -0
  9. data/Gemfile +20 -0
  10. data/LICENSE.txt +661 -0
  11. data/README.md +177 -0
  12. data/Rakefile +8 -0
  13. data/bin/console +14 -0
  14. data/bin/setup +8 -0
  15. data/chem_scanner.gemspec +43 -0
  16. data/lib/chem_scanner.rb +79 -0
  17. data/lib/chem_scanner/cdx.rb +67 -0
  18. data/lib/chem_scanner/cdxml.rb +72 -0
  19. data/lib/chem_scanner/chem_draw/cdx_reader.rb +101 -0
  20. data/lib/chem_scanner/chem_draw/node/base_node.rb +123 -0
  21. data/lib/chem_scanner/chem_draw/node/base_value.rb +257 -0
  22. data/lib/chem_scanner/chem_draw/node/bond.rb +100 -0
  23. data/lib/chem_scanner/chem_draw/node/bracket_attachment.rb +17 -0
  24. data/lib/chem_scanner/chem_draw/node/bracket_group.rb +32 -0
  25. data/lib/chem_scanner/chem_draw/node/chem_geometry.rb +58 -0
  26. data/lib/chem_scanner/chem_draw/node/color_table.rb +46 -0
  27. data/lib/chem_scanner/chem_draw/node/font_table.rb +54 -0
  28. data/lib/chem_scanner/chem_draw/node/fragment.rb +149 -0
  29. data/lib/chem_scanner/chem_draw/node/fragment_node.rb +145 -0
  30. data/lib/chem_scanner/chem_draw/node/graphic.rb +94 -0
  31. data/lib/chem_scanner/chem_draw/node/text.rb +242 -0
  32. data/lib/chem_scanner/chem_draw/parser.rb +214 -0
  33. data/lib/chem_scanner/chem_draw/yaml/cdx_objects.yaml +32 -0
  34. data/lib/chem_scanner/chem_draw/yaml/cdx_props.yaml +263 -0
  35. data/lib/chem_scanner/chem_draw/yaml/cdxml_objects.yaml +36 -0
  36. data/lib/chem_scanner/chem_draw/yaml/cdxml_props.yaml +263 -0
  37. data/lib/chem_scanner/chem_draw/yaml/props_data_type.yaml +263 -0
  38. data/lib/chem_scanner/configuration/abbreviation.rb +76 -0
  39. data/lib/chem_scanner/configuration/superatom.rb +76 -0
  40. data/lib/chem_scanner/configuration/superatom.txt +2874 -0
  41. data/lib/chem_scanner/configuration/util.rb +40 -0
  42. data/lib/chem_scanner/configuration/yaml/abbreviations.yaml +6399 -0
  43. data/lib/chem_scanner/configuration/yaml/elements.yaml +115 -0
  44. data/lib/chem_scanner/configuration/yaml/solvents.yaml +16 -0
  45. data/lib/chem_scanner/doc.rb +56 -0
  46. data/lib/chem_scanner/docx.rb +86 -0
  47. data/lib/chem_scanner/export/cml.rb +176 -0
  48. data/lib/chem_scanner/extension/element_map.rb +9 -0
  49. data/lib/chem_scanner/extension/geometry/bounding_box.rb +84 -0
  50. data/lib/chem_scanner/extension/geometry/line.rb +123 -0
  51. data/lib/chem_scanner/extension/geometry/point.rb +18 -0
  52. data/lib/chem_scanner/extension/geometry/polygon.rb +115 -0
  53. data/lib/chem_scanner/extension/geometry/segment.rb +196 -0
  54. data/lib/chem_scanner/extension/passthrough.rb +7 -0
  55. data/lib/chem_scanner/interpreter/element/arrow.rb +298 -0
  56. data/lib/chem_scanner/interpreter/element/atom.rb +134 -0
  57. data/lib/chem_scanner/interpreter/element/fragment.rb +59 -0
  58. data/lib/chem_scanner/interpreter/element/molecule.rb +473 -0
  59. data/lib/chem_scanner/interpreter/element/molecule_group.rb +34 -0
  60. data/lib/chem_scanner/interpreter/element/reaction.rb +186 -0
  61. data/lib/chem_scanner/interpreter/element/reaction_step.rb +39 -0
  62. data/lib/chem_scanner/interpreter/formula_to_mol.rb +75 -0
  63. data/lib/chem_scanner/interpreter/post_process/assemble.rb +38 -0
  64. data/lib/chem_scanner/interpreter/post_process/label_by_molecule.rb +37 -0
  65. data/lib/chem_scanner/interpreter/post_process/reaction_info.rb +225 -0
  66. data/lib/chem_scanner/interpreter/post_process/reaction_step.rb +95 -0
  67. data/lib/chem_scanner/interpreter/post_process/reagent_label.rb +46 -0
  68. data/lib/chem_scanner/interpreter/post_process/text_as_molecule.rb +52 -0
  69. data/lib/chem_scanner/interpreter/post_process/text_label.rb +40 -0
  70. data/lib/chem_scanner/interpreter/pre_process/arrow.rb +197 -0
  71. data/lib/chem_scanner/interpreter/pre_process/graphic.rb +41 -0
  72. data/lib/chem_scanner/interpreter/pre_process/molecule.rb +150 -0
  73. data/lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb +129 -0
  74. data/lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb +50 -0
  75. data/lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb +55 -0
  76. data/lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb +85 -0
  77. data/lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb +115 -0
  78. data/lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb +166 -0
  79. data/lib/chem_scanner/interpreter/scheme.rb +173 -0
  80. data/lib/chem_scanner/interpreter/scheme_base.rb +64 -0
  81. data/lib/chem_scanner/interpreter/text_group/bold_groups.rb +183 -0
  82. data/lib/chem_scanner/interpreter/text_group/molecule_text_group.rb +138 -0
  83. data/lib/chem_scanner/interpreter/text_group/reaction_text_groups.rb +221 -0
  84. data/lib/chem_scanner/interpreter/text_group/retrieve_alias_info.rb +41 -0
  85. data/lib/chem_scanner/interpreter/text_group/retrieve_n_atoms.rb +106 -0
  86. data/lib/chem_scanner/interpreter/text_group/text_group_interpreter.rb +92 -0
  87. data/lib/chem_scanner/perkin_eln.rb +287 -0
  88. data/lib/chem_scanner/version.rb +5 -0
  89. data/lib/rubygems_plugin.rb +5 -0
  90. metadata +244 -0
data/README.md ADDED
@@ -0,0 +1,177 @@
1
+
2
+ # Introduction
3
+
4
+ The `ChemScanner` library attempts to extract and interpret reaction and molecule information from ChemDraw-related file formats: CDX, CDXML, CDX embedded within DOC and DOCX, and [Perkin Elmer ELN](http://www.perkinelmer.com/category/notebook).
5
+
6
+ # Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'chem_scanner'
12
+ ```
13
+
14
+ And then execute:
15
+
16
+ $ bundle
17
+
18
+ Or install it yourself as:
19
+
20
+ $ gem install chem_scanner
21
+
22
+ # UI for ChemScanner
23
+ You can try `ChemScanner` at https://eln.chemotion.net/ or https://eln.chemotion.net/chemscanner. The UI is more user-friendly and offers some additional features:
24
+
25
+ - Export to Excel and CML.
26
+ - Preview of the original scheme.
27
+ - Import directly to [Chemotion ELN](https://eln.chemotion.net)
28
+ - Add comments to each extracted scheme. These comments also appear in the export and in the molecules/reactions imported into Chemotion ELN.
29
+ - ...
30
+
31
+ # Usage
32
+
33
+ To scan/extract a single CDX file
34
+
35
+ ```ruby
36
+ require 'chem_scanner'
37
+
38
+ cdx = ChemScanner::Cdx.new
39
+ cdx.read('/path/to/cdx/file')
40
+ # Get array of scanned Canonical SMILES
41
+ cdx.molecules.map(&:get_cano_smiles)
42
+ # Get array of scanned Reactions in SMILES
43
+ cdx.reactions.map(&:reaction_smiles)
44
+ ```
45
+ There are 5 classes corresponding to the 5 supported file formats: CDX, CDXML, DOC, DOCX, PerkinELN.
46
+
47
+ # API
48
+
49
+ ## Molecule
50
+
51
+ - Access "scanned" molecules
52
+
53
+ ```ruby
54
+ # Molecules - array of scanned molecules
55
+ cdx.molecules
56
+ # Get array of scanned Canonical SMILES
57
+ cdx.molecules.map(&:get_cano_smiles)
58
+ # Get one molecule
59
+ molecule = cdx.molecules.first
60
+ # Number of scanned molecules
61
+ cdx.molecules.count
62
+ ```
63
+
64
+ - Molecule class:
65
+
66
+ ```ruby
67
+ # Canonical SMILES
68
+ molecule.get_cano_smiles
69
+ # Molfile
70
+ molecule.get_mdl
71
+ # RDKIT RWMol (https://www.rdkit.org/docs/cppapi/classRDKit_1_1RWMol.html)
72
+ molecule.rw_mol
73
+ # Molecule label (bold text near molecule)
74
+ molecule.label
75
+ # Molecule text (molecule description)
76
+ molecule.text
77
+ # Molecule details (additional information from Perkin Elmer ELN)
78
+ molecule.details
79
+ ```
80
+ We are using a [ruby-binding version](https://github.com/CamAnNguyen/rdkit_chem) of `RDKit` as a dependency of `ChemScanner`.
81
+
82
+ ## Reaction
83
+
84
+ A reaction consists of 3 groups of molecules: `reactants`, `reagents` and `products`. Each group is an array of molecules, in which each element is an object of the `Molecule` class. In addition, some abbreviations that belong to the reaction are represented by SMILES. These can be accessed via `reagent_smiles`.
85
+
86
+ ```ruby
87
+ reaction = cdx.reactions.first
88
+ # Access extracted structure group
89
+ reactants = reaction.reactants
90
+ reagents = reaction.reagents
91
+ products = reaction.products
92
+ reagent_smiles = reaction.reagent_smiles
93
+ ```
94
+
95
+ Further manipulation of each group works the same way as for the `Molecule` class.
96
+
97
+ - **Reaction properties**
98
+
99
+ The reaction itself has `description`, `yield`, `time`, `temperature` and `details` properties. All of these properties are extracted from the ChemDraw scheme, except the `details` field, which holds additional information from `PerkinELN`.
100
+
101
+ - **Reaction step**
102
+
103
+ Some multi-step reactions can also be recognized. If a reaction is a multi-step reaction, the "steps" could be accessed via:
104
+
105
+ ```ruby
106
+ # Get first scanned reaction
107
+ reaction = cdx.reactions.first
108
+ # Access first step
109
+ step = reaction.steps.first
110
+ step.number # Should be 1
111
+ step.description
112
+ step.time
113
+ step.temperature
114
+ # List reagents SMILES
115
+ step.reagents
116
+ ```
117
+
118
+ Each step has the following properties: `description`, `time`, `temperature`, and `reagents`.
119
+
120
+ ## Supported File Formats
121
+
122
+ CDX, CDXML, PerkinELN usage and API are described above. Their outputs are simple `molecules` and `reactions`.
123
+
124
+ The DOC and DOCX classes are a little bit different, since a DOC or DOCX file can contain more than one embedded ChemDraw scheme, each of which is one CDX scheme.
125
+ `ChemScanner` attempts to extract all of them and puts them into one `Hash` map, called `cdx_map`.
126
+
127
+ ```ruby
128
+ require 'chem_scanner'
129
+
130
+ doc = ChemScanner::Doc.new
131
+ doc.read('/path/to/doc/file')
132
+ doc.cdx_map.each do |key, cdx|
133
+ puts cdx.reactions.map(&:reaction_smiles)
134
+ end
135
+
136
+ # Access all molecules in all CDXs
137
+ doc.molecules.map(&:get_cano_smiles)
138
+ # Access all reactions in all CDXs
139
+ doc.reactions.map(&:reaction_smiles)
140
+ ```
141
+
142
+ DOCX is a bit different: `ChemScanner` can extract each CDX together with its preview image from the document.
143
+
144
+ ```ruby
145
+ require 'chem_scanner'
146
+
147
+ docx = ChemScanner::Docx.new
148
+ docx.read('/path/to/docx/file')
149
+ docx.cdx_map.each do |key, cdx_info|
150
+ # Get the CDX scheme
151
+ cdx = cdx_info[:cdx]
152
+ puts cdx.reactions.map(&:reaction_smiles)
153
+
154
+ # Preview images, used for ChemScanner UI
155
+ img_ext = cdx_info[:img_ext] # Could be '.png', '.emf'
156
+ img_b64 = cdx_info[:img_b64] # Base64-encoded image data
157
+ end
158
+
159
+ # Access all molecules in all CDXs
160
+ docx.molecules.map(&:get_cano_smiles)
161
+ # Access all reactions in all CDXs
162
+ docx.reactions.map(&:reaction_smiles)
163
+ ```
164
+
165
+ # Development
166
+
167
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
168
+
169
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
170
+
171
+ # Contributing
172
+
173
+ Bug reports and pull requests are welcome on GitHub at https://github.com/CamAnNguyen/chem_scanner. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
174
+
175
+ # License
176
+
177
+ The gem is available as open source under the terms of the [GNU AGPLv3 License](https://www.gnu.org/licenses/agpl-3.0.en.html).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "chem_scanner"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,43 @@
# frozen_string_literal: true

lib = File.expand_path("lib", __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require "chem_scanner/version"

# Gem specification for chem_scanner: metadata, packaged files and
# dependencies.
# rubocop:disable Metrics/BlockLength
Gem::Specification.new do |spec|
  spec.name = "chem_scanner"
  spec.version = ChemScanner::VERSION
  spec.authors = ["an.nguyen"]
  spec.email = ["an.nguyen@kit.edu"]

  spec.summary = "Extraction of chemical information"
  spec.description = "ChemScanner is a chemical utility to extract " \
                     "chemical information from various scientific formats"
  spec.homepage = "https://chemotion.net"
  # Must agree with the README's License section, which states that the gem
  # is released under the GNU AGPLv3.
  spec.license = "AGPL-3.0"

  # Package every git-tracked file except those under test/, spec/ or
  # features/.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject do |f|
      f.match(%r{^(test|spec|features)/})
    end
  end

  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.required_ruby_version = ">= 2.3"

  spec.add_development_dependency "bundler", ">= 1.16"
  spec.add_development_dependency "rake", ">= 10.0"
  spec.add_development_dependency "rspec", ">= 3.0"

  spec.add_dependency "chronic_duration", ">= 0.10"
  spec.add_dependency "nokogiri", ">= 1.8"
  spec.add_dependency "rdkit_chem"
  spec.add_dependency "ruby-geometry", ">= 0.0.6"
  spec.add_dependency "ruby-ole", ">= 1.2"
end
# rubocop:enable Metrics/BlockLength
@@ -0,0 +1,79 @@
require "yaml"
require "rdkit_chem"
require "ostruct"
require "forwardable"

# ChemScanner main module.
#
# Requires the extension and configuration files, then exposes module-level
# helpers that forward to the shared Superatom and Abbreviation objects
# (both obtained through `.instance`).
module ChemScanner
  %w[
    chem_scanner/extension/*/*.rb
    chem_scanner/extension/*.rb
    chem_scanner/configuration/*.rb
  ].flat_map { |pattern| Gem.find_files(pattern) }
   .each { |path| require path }

  @superatom = Superatom.instance
  @abbreviation = Abbreviation.instance

  class << self
    # Forwards to Superatom#sync_custom.
    def sync_custom_superatom
      @superatom.sync_custom
    end

    # Forwards to Superatom#all.
    def all_superatoms
      @superatom.all
    end

    # Forwards to Superatom#predefined.
    def predefined_superatoms
      @superatom.predefined
    end

    # Forwards to Superatom#custom.
    def custom_superatoms
      @superatom.custom
    end

    # Forwards to Superatom#get_superatom.
    def get_superatom(superatom)
      @superatom.get_superatom(superatom)
    end

    # Forwards to Superatom#add with the superatom and its SMILES argument.
    def add_superatom(satom, smi)
      @superatom.add(satom, smi)
    end

    # Forwards to Superatom#remove.
    def remove_superatom(satom)
      @superatom.remove(satom)
    end

    # Forwards to Abbreviation#predefined.
    def predefined_abbreviations
      @abbreviation.predefined
    end

    # Forwards to Abbreviation#solvents.
    def solvents
      @abbreviation.solvents
    end

    # Forwards to Abbreviation#all.
    def all_abbreviations
      @abbreviation.all
    end

    # Looks up an abbreviation, giving superatoms precedence: when the
    # superatom lookup for `abb` is non-empty an empty string is returned,
    # otherwise the call is forwarded to Abbreviation#get_abbreviation.
    def get_abbreviation(abb)
      return "" unless @superatom.get_superatom(abb).empty?

      @abbreviation.get_abbreviation(abb)
    end

    # Forwards to Abbreviation#add.
    def add_abbreviation(abb, smi)
      @abbreviation.add(abb, smi)
    end

    # Forwards to Abbreviation#add_hash.
    def add_abbreviation_hash(hash)
      @abbreviation.add_hash(hash)
    end

    # Forwards to Abbreviation#remove.
    def remove_abbreviation(abb)
      @abbreviation.remove(abb)
    end
  end
end

# Load the top-level readers first, then the exporters.
%w[chem_scanner/*.rb chem_scanner/export/*.rb].each do |pattern|
  Gem.find_files(pattern).each { |path| require path }
end
@@ -0,0 +1,67 @@
# frozen_string_literal: true

# Top module
module ChemScanner
  require "chem_scanner/chem_draw/parser"
  require "chem_scanner/chem_draw/cdx_reader"

  # Parser that traverses the object tree of a CDX binary file and
  # interprets it into molecules and reactions.
  class Cdx < ChemDraw::Parser
    attr_accessor :version

    # Tags that read_global looks for, in this order.
    CREATIONPROGRAM = 0x0003
    COLORTABLE = 0x0300
    FONTTABLE = 0x0100

    def initialize
      super

      @type = "cdx"
    end

    # Reads and interprets a CDX source.
    #
    # `file` and `is_path` are handed unchanged to ChemDraw::CdxReader.new.
    # Returns false when the reader reports the input as not valid, and
    # true once the scheme has been interpreted and @molecules / @reactions
    # have been populated.
    def read(file, is_path = true)
      @reader = ChemDraw::CdxReader.new(file, is_path)
      return false unless @reader.valid

      read_global
      read_objects until @reader.end?
      rebuild_objects_map

      @scheme = Interpreter::Scheme.new(self)
      @scheme.interpret
      @molecules, @reactions = @scheme.molecules, @scheme.reactions

      true
    end

    # Returns the reader's `bin` value.
    def raw_data
      @reader.bin
    end

    private

    # Reads the creation program (whose last space-separated word is kept
    # as the version), then the color table, then the font table.
    def read_global
      skip_until_tag(CREATIONPROGRAM)
      @version = @reader.data.split(" ").last

      skip_until_tag(COLORTABLE)
      @color_table = read_colortable(@reader.data, "cdx")

      skip_until_tag(FONTTABLE)
      @font_table = read_fonttable(@reader.data, "cdx")
    end

    # Keeps calling the reader's read_next until it yields `wanted`.
    def skip_until_tag(wanted)
      current = @reader.read_next
      current = @reader.read_next until current == wanted
    end

    # Parses objects for as long as the reader yields a positive tag.
    def read_objects
      while (tag = @reader.read_next(true)).positive?
        parse_object(tag, @reader.current_id)
      end
    end
  end
end
@@ -0,0 +1,72 @@
# frozen_string_literal: true

# Main module
module ChemScanner
  require "nokogiri"
  require "chem_scanner/chem_draw/parser"

  # Reads and parses CDXML documents.
  class Cdxml < ChemDraw::Parser
    attr_accessor :version, :reader

    # System identifier a document's DOCTYPE must carry to be accepted.
    CDXML_DOCTYPE = "http://www.cambridgesoft.com/xml/cdxml.dtd"

    def initialize
      super

      @type = "cdxml"
    end

    # Reads and interprets a CDXML source.
    #
    # file    - a path when `is_path` is true, otherwise whatever
    #           Nokogiri::XML accepts (it is passed through unchanged).
    # is_path - whether `file` names a file on disk.
    #
    # Returns false when the document has no DOCTYPE or its system id is not
    # CDXML_DOCTYPE; returns true after the scheme has been interpreted and
    # @molecules / @reactions have been populated.
    def read(file, is_path = true)
      # Block form closes the file handle as soon as parsing is done.
      @cdxml = if is_path
                 File.open(file) { |fs| Nokogiri::XML(fs) }
               else
                 Nokogiri::XML(file)
               end

      # internal_subset is nil for documents without a DOCTYPE; treat that
      # as "not CDXML" instead of raising NoMethodError.
      return false unless @cdxml.internal_subset&.system_id == CDXML_DOCTYPE

      read_global

      @cdxml.xpath("//page").each do |page|
        @reader = page
        read_objects
      end

      rebuild_objects_map

      @scheme = Interpreter::Scheme.new(self)
      @scheme.interpret

      @molecules = @scheme.molecules
      @reactions = @scheme.reactions

      true
    end

    # Returns the parsed document serialised with `to_xml`.
    def raw_data
      @cdxml.to_xml
    end

    # Reads the version (last space-separated word of the CreationProgram
    # attribute), the color table and the font table.
    def read_global
      @version = @cdxml.xpath("//CDXML/@CreationProgram").text.split(" ").last

      ct = @cdxml.xpath("//CDXML/colortable").first
      @color_table = read_colortable(ct, "cdxml")

      ft = @cdxml.xpath("//CDXML/fonttable").first
      @font_table = read_fonttable(ft, "cdxml")
    end

    # Walks the element children of the current @reader node. Nodes mapped
    # to "Group" in ChemDraw::CDXML_OBJ are descended into recursively;
    # every other node is handed to parse_object with its integer id
    # (0 when the node has no `id` attribute).
    def read_objects
      nodes = @reader.element_children

      nodes.each do |node|
        # @reader always points at the node currently being processed.
        @reader = node
        nid = (node.attr("id") || 0).to_i

        if ChemDraw::CDXML_OBJ[node.name] == "Group"
          read_objects
        else
          parse_object(node.name, nid)
        end
      end
    end
  end
end
+ end