edlib 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ad1a2f931b436577baac1923b324837f50aa133efbe48843f0956711b6e3fe4c
4
- data.tar.gz: 62c9924efcf428902528e6ce29e1be6d7d874bd5db406b29bdeee00b3491e800
3
+ metadata.gz: 1a7ecc7eca8aef827a022a66bcf9c540e24f0e3d070b8af5d37913f112f68b0d
4
+ data.tar.gz: 6c25316af8bc413b4c014660e9bb7769a139ecd537983a67ab4a249fcaa31faf
5
5
  SHA512:
6
- metadata.gz: ca119f2d471a21d8cb8da7820d2a1723396040d8a62c18c4cbcdb9b7c8879ed7a8792d57193e110a122c3b9d2d5b883aeab869cdc7d465e4aeec687c23896c5d
7
- data.tar.gz: 596c89f8fe1112bb876edfb89849d6f634b4aa1d84ea3f47a6a24dbdb78070f86f063a0d38ef5aaccc4e4ea66edfd5d9c699ccb778dcf0f8812794018be944e7
6
+ metadata.gz: 0e63d0dbdb063833d7bea3e0ee53ca0ae0daa1e5828d0c022e9d7da8c5c3f3a3ce8dc16418fd7b925894767bdc053cebc7e85c214f5541d2d4c2d40434ef4dd9
7
+ data.tar.gz: c666e503c1af8de0d91cc8acce4d253f5e73f84137cc2b2436b936a145c552997db518c8193ba917c6ccd6f14134f80e464bb6e6f1e6b23f751413529d00c454
data/README.md CHANGED
@@ -1,22 +1,33 @@
1
1
  # ruby-edlib
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/edlib.svg)](https://badge.fury.io/rb/edlib)
4
+ [![test](https://github.com/kojix2/ruby-edlib/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-edlib/actions/workflows/ci.yml)
5
+
3
6
  [Edlib](https://github.com/Martinsos/edlib) - A lightweight and super fast C/C++ library for sequence alignment using edit distance
4
7
 
5
8
  ## Installation
6
9
 
7
10
  ```
8
- sudo apt install edlib-dev
9
11
  gem install edlib
10
12
  ```
11
13
 
12
- ## API
14
+ The gem compiles the edlib source code bundled inside it at installation time. If you want to use the latest edlib, see Development.
15
+
16
+ ## Usage
13
17
 
14
18
  ```ruby
15
19
  require "edlib"
16
20
 
17
21
  a = Edlib::Aligner.new(mode: :hw, task: :path)
18
22
  a.align("AACG", "TCAACCTG")
19
- # => {:edit_distance=>1, :alphabet_length=>4, :locations=>[[2, 4], [2, 5]], :alignment=>[0, 0, 0, 1], :cigar=>"3=1I"}
23
+
24
+ # {
25
+ # :edit_distance => 1,
26
+ # :alphabet_length => 4,
27
+ # :locations => [[2, 4], [2, 5]],
28
+ # :alignment => [0, 0, 0, 1],
29
+ # :cigar => "3=1I"
30
+ # }
20
31
  ```
21
32
 
22
33
  |keyword argument |description|
@@ -28,7 +39,7 @@ a.align("AACG", "TCAACCTG")
28
39
 
29
40
  ## Development
30
41
 
31
- * Pull requests welcome
42
+ Pull requests welcome!
32
43
 
33
44
  ```sh
34
45
  git clone https://github.com/kojix2/ruby-edlib # Please fork repo
@@ -37,3 +48,14 @@ bundle install
37
48
  bundle exec rake compile
38
49
  bundle exec rake test
39
50
  ```
51
+
52
+ Use latest edlib
53
+
54
+ ```sh
55
+ git clone https://github.com/kojix2/ruby-edlib
56
+ cd ruby-edlib
57
+ bundle install
58
+ bundle exec rake edlib:update # Download latest edlib.h and edlib.cpp
59
+ bundle exec rake compile
60
+ bundle exec rake test
61
+ ```
data/ext/edlib/edlib.h ADDED
@@ -0,0 +1,277 @@
1
+ #ifndef EDLIB_H
2
+ #define EDLIB_H
3
+
4
+ /**
5
+ * @file
6
+ * @author Martin Sosic
7
+ * @brief Main header file, containing all public functions and structures.
8
+ */
9
+
10
+ // Define EDLIB_API macro to properly export symbols
11
+ #ifdef EDLIB_SHARED
12
+ # ifdef _WIN32
13
+ # ifdef EDLIB_BUILD
14
+ # define EDLIB_API __declspec(dllexport)
15
+ # else
16
+ # define EDLIB_API __declspec(dllimport)
17
+ # endif
18
+ # else
19
+ # define EDLIB_API __attribute__ ((visibility ("default")))
20
+ # endif
21
+ #else
22
+ # define EDLIB_API
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ // Status codes
30
+ #define EDLIB_STATUS_OK 0
31
+ #define EDLIB_STATUS_ERROR 1
32
+
33
+ /**
34
+ * Alignment methods - how should Edlib treat gaps before and after query?
35
+ */
36
+ typedef enum {
37
+ /**
38
+ * Global method. This is the standard method.
39
+ * Useful when you want to find out how similar the first sequence is to the second sequence.
40
+ */
41
+ EDLIB_MODE_NW,
42
+ /**
43
+ * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44
+ * What that means is that deleting elements from the end of second sequence is "free"!
45
+ * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
46
+ * of second sequence is "free" and does not count into total edit distance. This method is appropriate
47
+ * when you want to find out how well first sequence fits at the beginning of second sequence.
48
+ */
49
+ EDLIB_MODE_SHW,
50
+ /**
51
+ * Infix method. Similar to the prefix method, but with one more twist - gaps at query end and start are
52
+ * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
53
+ * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
54
+ * and GAC from the end of second sequence is "free" and does not count into total edit distance.
55
+ * This method is appropriate when you want to find out how well first sequence fits at any part of
56
+ * second sequence.
57
+ * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
58
+ * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
59
+ * that text. In bioinformatics, this method is appropriate for aligning a read to a sequence.
60
+ */
61
+ EDLIB_MODE_HW
62
+ } EdlibAlignMode;
63
+
64
+ /**
65
+ * Alignment tasks - what do you want Edlib to do?
66
+ */
67
+ typedef enum {
68
+ EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
69
+ EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
70
+ EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
71
+ } EdlibAlignTask;
72
+
73
+ /**
74
+ * Describes cigar format.
75
+ * @see http://samtools.github.io/hts-specs/SAMv1.pdf
76
+ * @see http://drive5.com/usearch/manual/cigar.html
77
+ */
78
+ typedef enum {
79
+ EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
80
+ EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
81
+ } EdlibCigarFormat;
82
+
83
+ // Edit operations.
84
+ #define EDLIB_EDOP_MATCH 0 //!< Match.
85
+ #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
86
+ #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
87
+ #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
88
+
89
+ /**
90
+ * @brief Defines two given characters as equal.
91
+ */
92
+ typedef struct {
93
+ char first;
94
+ char second;
95
+ } EdlibEqualityPair;
96
+
97
+ /**
98
+ * @brief Configuration object for edlibAlign() function.
99
+ */
100
+ typedef struct {
101
+ /**
102
+ * Set k to non-negative value to tell edlib that edit distance is not larger than k.
103
+ * Smaller k can significantly improve speed of computation.
104
+ * If edit distance is larger than k, edlib will set edit distance to -1.
105
+ * Set k to negative value and edlib will internally auto-adjust k until score is found.
106
+ */
107
+ int k;
108
+
109
+ /**
110
+ * Alignment method.
111
+ * EDLIB_MODE_NW: global (Needleman-Wunsch)
112
+ * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
113
+ * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
114
+ */
115
+ EdlibAlignMode mode;
116
+
117
+ /**
118
+ * Alignment task - tells Edlib what to calculate. The less there is to calculate, the faster it is.
119
+ * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
120
+ * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
121
+ * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
122
+ */
123
+ EdlibAlignTask task;
124
+
125
+ /**
126
+ * List of pairs of characters, where each pair defines two characters as equal.
127
+ * This way you can extend edlib's definition of equality (which is that each character is equal only
128
+ * to itself).
129
+ * This can be useful if you have some wildcard characters that should match multiple other characters,
130
+ * or e.g. if you want edlib to be case insensitive.
131
+ * Can be set to NULL if there are none.
132
+ */
133
+ const EdlibEqualityPair* additionalEqualities;
134
+
135
+ /**
136
+ * Number of additional equalities, which is non-negative number.
137
+ * 0 if there are none.
138
+ */
139
+ int additionalEqualitiesLength;
140
+ } EdlibAlignConfig;
141
+
142
+ /**
143
+ * Helper method for easy construction of configuration object.
144
+ * @return Configuration object filled with given parameters.
145
+ */
146
+ EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
147
+ int k, EdlibAlignMode mode, EdlibAlignTask task,
148
+ const EdlibEqualityPair* additionalEqualities,
149
+ int additionalEqualitiesLength
150
+ );
151
+
152
+ /**
153
+ * @return Default configuration object, with following defaults:
154
+ * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
155
+ */
156
+ EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
157
+
158
+
159
+ /**
160
+ * Container for results of alignment done by edlibAlign() function.
161
+ */
162
+ typedef struct {
163
+ /**
164
+ * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
165
+ */
166
+ int status;
167
+
168
+ /**
169
+ * -1 if k is non-negative and edit distance is larger than k.
170
+ */
171
+ int editDistance;
172
+
173
+ /**
174
+ * Array of zero-based positions in target where optimal alignment paths end.
175
+ * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
176
+ * Set to NULL if edit distance is larger than k.
177
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
178
+ */
179
+ int* endLocations;
180
+
181
+ /**
182
+ * Array of zero-based positions in target where optimal alignment paths start,
183
+ * they correspond to endLocations.
184
+ * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
185
+ * Set to NULL if not calculated or if edit distance is larger than k.
186
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
187
+ */
188
+ int* startLocations;
189
+
190
+ /**
191
+ * Number of end (and start) locations.
192
+ */
193
+ int numLocations;
194
+
195
+ /**
196
+ * Alignment is found for first pair of start and end locations.
197
+ * Set to NULL if not calculated.
198
+ * Alignment is sequence of numbers: 0, 1, 2, 3.
199
+ * 0 stands for match.
200
+ * 1 stands for insertion to target.
201
+ * 2 stands for insertion to query.
202
+ * 3 stands for mismatch.
203
+ * Alignment aligns query to target from beginning of query till end of query.
204
+ * If gaps are not penalized, they are not in alignment.
205
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
206
+ */
207
+ unsigned char* alignment;
208
+
209
+ /**
210
+ * Length of alignment.
211
+ */
212
+ int alignmentLength;
213
+
214
+ /**
215
+ * Number of different characters in query and target together.
216
+ */
217
+ int alphabetLength;
218
+ } EdlibAlignResult;
219
+
220
+ /**
221
+ * Frees memory in EdlibAlignResult that was allocated by edlib.
222
+ * If you do not use it, make sure to free needed members manually using free().
223
+ */
224
+ EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
225
+
226
+
227
+ /**
228
+ * Aligns two sequences (query and target) using edit distance (levenshtein distance).
229
+ * Through config parameter, this function supports different alignment methods (global, prefix, infix),
230
+ * as well as different modes of search (tasks).
231
+ * It always returns edit distance and end locations of optimal alignment in target.
232
+ * It optionally returns start locations of optimal alignment in target and alignment path,
233
+ * if you choose appropriate tasks.
234
+ * @param [in] query First sequence.
235
+ * @param [in] queryLength Number of characters in first sequence.
236
+ * @param [in] target Second sequence.
237
+ * @param [in] targetLength Number of characters in second sequence.
238
+ * @param [in] config Additional alignment parameters, like alignment method and wanted results.
239
+ * @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
240
+ * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
241
+ */
242
+ EDLIB_API EdlibAlignResult edlibAlign(
243
+ const char* query, int queryLength,
244
+ const char* target, int targetLength,
245
+ const EdlibAlignConfig config
246
+ );
247
+
248
+
249
+ /**
250
+ * Builds cigar string from given alignment sequence.
251
+ * @param [in] alignment Alignment sequence.
252
+ * 0 stands for match.
253
+ * 1 stands for insertion to target.
254
+ * 2 stands for insertion to query.
255
+ * 3 stands for mismatch.
256
+ * @param [in] alignmentLength
257
+ * @param [in] cigarFormat Cigar will be returned in specified format.
258
+ * @return Cigar string.
259
+ * I stands for insertion.
260
+ * D stands for deletion.
261
+ * X stands for mismatch. (used only in extended format)
262
+ * = stands for match. (used only in extended format)
263
+ * M stands for (mis)match. (used only in standard format)
264
+ * String is null terminated.
265
+ * Needed memory is allocated and given pointer is set to it.
266
+ * Do not forget to free it later using free()!
267
+ */
268
+ EDLIB_API char* edlibAlignmentToCigar(
269
+ const unsigned char* alignment, int alignmentLength,
270
+ EdlibCigarFormat cigarFormat
271
+ );
272
+
273
+ #ifdef __cplusplus
274
+ }
275
+ #endif
276
+
277
+ #endif // EDLIB_H
@@ -92,9 +92,14 @@ aligner_get_mode(VALUE self)
92
92
  static VALUE
93
93
  set_mode(EdlibAlignConfig *config, VALUE mode)
94
94
  {
95
+ if (TYPE(mode) == T_SYMBOL)
96
+ {
97
+ mode = rb_String(mode);
98
+ }
95
99
  switch (TYPE(mode))
96
100
  {
97
- case T_STRING:
101
+ case T_STRING:;
102
+ rb_funcall(mode, rb_intern("upcase!"), 0);
98
103
  if (strcmp(RSTRING_PTR(mode), "NW") == 0)
99
104
  {
100
105
  config->mode = 0;
@@ -118,7 +123,7 @@ set_mode(EdlibAlignConfig *config, VALUE mode)
118
123
  {
119
124
  rb_raise(rb_eArgError, "Invalid mode");
120
125
  }
121
- config->mode = NUM2INT(mode);
126
+ config->mode = m;
122
127
  break;
123
128
  default:
124
129
  rb_raise(rb_eArgError, "Invalid mode");
@@ -159,9 +164,14 @@ aligner_get_task(VALUE self)
159
164
  static VALUE
160
165
  set_task(EdlibAlignConfig *config, VALUE task)
161
166
  {
167
+ if (TYPE(task) == T_SYMBOL)
168
+ {
169
+ task = rb_String(task);
170
+ }
162
171
  switch (TYPE(task))
163
172
  {
164
- case T_STRING:
173
+ case T_STRING:;
174
+ rb_funcall(task, rb_intern("upcase!"), 0);
165
175
  if (strcmp(RSTRING_PTR(task), "DISTANCE") == 0)
166
176
  {
167
177
  config->task = 0;
@@ -185,7 +195,7 @@ set_task(EdlibAlignConfig *config, VALUE task)
185
195
  {
186
196
  rb_raise(rb_eArgError, "Invalid task");
187
197
  }
188
- config->task = NUM2INT(task);
198
+ config->task = t;
189
199
  break;
190
200
  default:
191
201
  rb_raise(rb_eArgError, "Invalid task");
@@ -381,7 +391,7 @@ aligner_align(VALUE self, VALUE query, VALUE target)
381
391
  return hash;
382
392
  }
383
393
 
384
- void Init_edlib(void)
394
+ void Init_edlibext(void)
385
395
  {
386
396
  mEdlib = rb_define_module("Edlib");
387
397
  cAligner = rb_define_class_under(mEdlib, "Aligner", rb_cObject);
data/ext/edlib/extconf.rb CHANGED
@@ -1,6 +1,3 @@
1
- require "mkmf"
1
+ require 'mkmf'
2
2
 
3
- dir_config('edlib')
4
- if have_header('edlib.h') and have_library('edlib')
5
- create_makefile('edlib/edlib')
6
- end
3
+ create_makefile('edlib/edlibext')
data/lib/edlib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Edlib
2
- VERSION = "0.0.1"
3
- end
2
+ VERSION = '0.0.2'
3
+ end
data/lib/edlib.rb CHANGED
@@ -1,8 +1,8 @@
1
- require_relative 'edlib/edlib'
1
+ require_relative 'edlib/edlibext'
2
2
 
3
3
  module Edlib
4
4
  class Aligner
5
- def initialize(k:-1, mode: 'NW', task: 'DISTANCE', additional_equalities: nil)
5
+ def initialize(k: -1, mode: 'NW', task: 'DISTANCE', additional_equalities: nil)
6
6
  mode = mode.to_s if mode.is_a? Symbol
7
7
  task = task.to_s if task.is_a? Symbol
8
8
  mode = mode.upcase if mode.is_a? String
@@ -10,4 +10,4 @@ module Edlib
10
10
  initialize_raw(k, mode, task, additional_equalities)
11
11
  end
12
12
  end
13
- end
13
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: edlib
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - kojix2
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-30 00:00:00.000000000 Z
11
+ date: 2022-10-31 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: 'Lightweight, super fast C/C++ library for sequence alignment using edit
14
14
  (Levenshtein) distance. '
@@ -20,7 +20,8 @@ extensions:
20
20
  extra_rdoc_files: []
21
21
  files:
22
22
  - README.md
23
- - ext/edlib/edlib.c
23
+ - ext/edlib/edlib.h
24
+ - ext/edlib/edlibext.c
24
25
  - ext/edlib/extconf.rb
25
26
  - lib/edlib.rb
26
27
  - lib/edlib/version.rb
@@ -28,7 +29,7 @@ homepage: https://github.com/kojix2/ruby-edlib
28
29
  licenses:
29
30
  - MIT
30
31
  metadata: {}
31
- post_install_message:
32
+ post_install_message:
32
33
  rdoc_options: []
33
34
  require_paths:
34
35
  - lib
@@ -44,7 +45,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
44
45
  version: '0'
45
46
  requirements: []
46
47
  rubygems_version: 3.3.7
47
- signing_key:
48
+ signing_key:
48
49
  specification_version: 4
49
50
  summary: ruby-edlib is a wrapper for edlib.
50
51
  test_files: []