edlib 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ad1a2f931b436577baac1923b324837f50aa133efbe48843f0956711b6e3fe4c
4
- data.tar.gz: 62c9924efcf428902528e6ce29e1be6d7d874bd5db406b29bdeee00b3491e800
3
+ metadata.gz: 1a7ecc7eca8aef827a022a66bcf9c540e24f0e3d070b8af5d37913f112f68b0d
4
+ data.tar.gz: 6c25316af8bc413b4c014660e9bb7769a139ecd537983a67ab4a249fcaa31faf
5
5
  SHA512:
6
- metadata.gz: ca119f2d471a21d8cb8da7820d2a1723396040d8a62c18c4cbcdb9b7c8879ed7a8792d57193e110a122c3b9d2d5b883aeab869cdc7d465e4aeec687c23896c5d
7
- data.tar.gz: 596c89f8fe1112bb876edfb89849d6f634b4aa1d84ea3f47a6a24dbdb78070f86f063a0d38ef5aaccc4e4ea66edfd5d9c699ccb778dcf0f8812794018be944e7
6
+ metadata.gz: 0e63d0dbdb063833d7bea3e0ee53ca0ae0daa1e5828d0c022e9d7da8c5c3f3a3ce8dc16418fd7b925894767bdc053cebc7e85c214f5541d2d4c2d40434ef4dd9
7
+ data.tar.gz: c666e503c1af8de0d91cc8acce4d253f5e73f84137cc2b2436b936a145c552997db518c8193ba917c6ccd6f14134f80e464bb6e6f1e6b23f751413529d00c454
data/README.md CHANGED
@@ -1,22 +1,33 @@
1
1
  # ruby-edlib
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/edlib.svg)](https://badge.fury.io/rb/edlib)
4
+ [![test](https://github.com/kojix2/ruby-edlib/actions/workflows/ci.yml/badge.svg)](https://github.com/kojix2/ruby-edlib/actions/workflows/ci.yml)
5
+
3
6
  [Edlib](https://github.com/Martinsos/edlib) - A lightweight and super fast C/C++ library for sequence alignment using edit distance
4
7
 
5
8
  ## Installation
6
9
 
7
10
  ```
8
- sudo apt install edlib-dev
9
11
  gem install edlib
10
12
  ```
11
13
 
12
- ## API
14
+ The Gem compiles the edlib source code inside the gem at installation. If you want to use the latest edlib, see Development.
15
+
16
+ ## Usage
13
17
 
14
18
  ```ruby
15
19
  require "edlib"
16
20
 
17
21
  a = Edlib::Aligner.new(mode: :hw, task: :path)
18
22
  a.align("AACG", "TCAACCTG")
19
- # => {:edit_distance=>1, :alphabet_length=>4, :locations=>[[2, 4], [2, 5]], :alignment=>[0, 0, 0, 1], :cigar=>"3=1I"}
23
+
24
+ # {
25
+ # :edit_distance => 1,
26
+ # :alphabet_length => 4,
27
+ # :locations => [[2, 4], [2, 5]],
28
+ # :alignment => [0, 0, 0, 1],
29
+ # :cigar => "3=1I"
30
+ # }
20
31
  ```
21
32
 
22
33
  |keyword argument |description|
@@ -28,7 +39,7 @@ a.align("AACG", "TCAACCTG")
28
39
 
29
40
  ## Development
30
41
 
31
- * Pull requests welcome
42
+ Pull requests welcome!
32
43
 
33
44
  ```sh
34
45
  git clone https://github.com/kojix2/ruby-edlib # Please fork repo
@@ -37,3 +48,14 @@ bundle install
37
48
  bundle exec rake compile
38
49
  bundle exec rake test
39
50
  ```
51
+
52
+ Use latest edlib
53
+
54
+ ```sh
55
+ git clone https://github.com/kojix2/ruby-edlib
56
+ cd ruby-edlib
57
+ bundle install
58
+ bundle exec rake edlib:update # Download latest edlib.h and edlib.cpp
59
+ bundle exec rake compile
60
+ bundle exec rake test
61
+ ```
data/ext/edlib/edlib.h ADDED
@@ -0,0 +1,277 @@
1
+ #ifndef EDLIB_H
2
+ #define EDLIB_H
3
+
4
+ /**
5
+ * @file
6
+ * @author Martin Sosic
7
+ * @brief Main header file, containing all public functions and structures.
8
+ */
9
+
10
+ // Define EDLIB_API macro to properly export symbols
11
+ #ifdef EDLIB_SHARED
12
+ # ifdef _WIN32
13
+ # ifdef EDLIB_BUILD
14
+ # define EDLIB_API __declspec(dllexport)
15
+ # else
16
+ # define EDLIB_API __declspec(dllimport)
17
+ # endif
18
+ # else
19
+ # define EDLIB_API __attribute__ ((visibility ("default")))
20
+ # endif
21
+ #else
22
+ # define EDLIB_API
23
+ #endif
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ // Status codes
30
+ #define EDLIB_STATUS_OK 0
31
+ #define EDLIB_STATUS_ERROR 1
32
+
33
+ /**
34
+ * Alignment methods - how should Edlib treat gaps before and after query?
35
+ */
36
+ typedef enum {
37
+ /**
38
+ * Global method. This is the standard method.
39
+ * Useful when you want to find out how similar the first sequence is to the second sequence.
40
+ */
41
+ EDLIB_MODE_NW,
42
+ /**
43
+ * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44
+ * What that means is that deleting elements from the end of second sequence is "free"!
45
+ * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
46
+ * of second sequence is "free" and does not count into total edit distance. This method is appropriate
47
+ * when you want to find out how well first sequence fits at the beginning of second sequence.
48
+ */
49
+ EDLIB_MODE_SHW,
50
+ /**
51
+ * Infix method. Similar to prefix method, but with one more twist - gaps at query end and start are
52
+ * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
53
+ * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
54
+ * and GAC from the end of second sequence is "free" and does not count into total edit distance.
55
+ * This method is appropriate when you want to find out how well first sequence fits at any part of
56
+ * second sequence.
57
+ * For example, if your second sequence was a long text and your first sequence was a sentence from that text,
58
+ * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
59
+ * that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
60
+ */
61
+ EDLIB_MODE_HW
62
+ } EdlibAlignMode;
63
+
64
+ /**
65
+ * Alignment tasks - what do you want Edlib to do?
66
+ */
67
+ typedef enum {
68
+ EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
69
+ EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
70
+ EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
71
+ } EdlibAlignTask;
72
+
73
+ /**
74
+ * Describes cigar format.
75
+ * @see http://samtools.github.io/hts-specs/SAMv1.pdf
76
+ * @see http://drive5.com/usearch/manual/cigar.html
77
+ */
78
+ typedef enum {
79
+ EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
80
+ EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
81
+ } EdlibCigarFormat;
82
+
83
+ // Edit operations.
84
+ #define EDLIB_EDOP_MATCH 0 //!< Match.
85
+ #define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
86
+ #define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
87
+ #define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
88
+
89
+ /**
90
+ * @brief Defines two given characters as equal.
91
+ */
92
+ typedef struct {
93
+ char first;
94
+ char second;
95
+ } EdlibEqualityPair;
96
+
97
+ /**
98
+ * @brief Configuration object for edlibAlign() function.
99
+ */
100
+ typedef struct {
101
+ /**
102
+ * Set k to non-negative value to tell edlib that edit distance is not larger than k.
103
+ * Smaller k can significantly improve speed of computation.
104
+ * If edit distance is larger than k, edlib will set edit distance to -1.
105
+ * Set k to negative value and edlib will internally auto-adjust k until score is found.
106
+ */
107
+ int k;
108
+
109
+ /**
110
+ * Alignment method.
111
+ * EDLIB_MODE_NW: global (Needleman-Wunsch)
112
+ * EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
113
+ * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
114
+ */
115
+ EdlibAlignMode mode;
116
+
117
+ /**
118
+ * Alignment task - tells Edlib what to calculate. The less there is to calculate, the faster it is.
119
+ * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
120
+ * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
121
+ * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
122
+ */
123
+ EdlibAlignTask task;
124
+
125
+ /**
126
+ * List of pairs of characters, where each pair defines two characters as equal.
127
+ * This way you can extend edlib's definition of equality (which is that each character is equal only
128
+ * to itself).
129
+ * This can be useful if you have some wildcard characters that should match multiple other characters,
130
+ * or e.g. if you want edlib to be case insensitive.
131
+ * Can be set to NULL if there are none.
132
+ */
133
+ const EdlibEqualityPair* additionalEqualities;
134
+
135
+ /**
136
+ * Number of additional equalities, which is non-negative number.
137
+ * 0 if there are none.
138
+ */
139
+ int additionalEqualitiesLength;
140
+ } EdlibAlignConfig;
141
+
142
+ /**
143
+ * Helper method for easy construction of configuration object.
144
+ * @return Configuration object filled with given parameters.
145
+ */
146
+ EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
147
+ int k, EdlibAlignMode mode, EdlibAlignTask task,
148
+ const EdlibEqualityPair* additionalEqualities,
149
+ int additionalEqualitiesLength
150
+ );
151
+
152
+ /**
153
+ * @return Default configuration object, with following defaults:
154
+ * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
155
+ */
156
+ EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
157
+
158
+
159
+ /**
160
+ * Container for results of alignment done by edlibAlign() function.
161
+ */
162
+ typedef struct {
163
+ /**
164
+ * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
165
+ */
166
+ int status;
167
+
168
+ /**
169
+ * -1 if k is non-negative and edit distance is larger than k.
170
+ */
171
+ int editDistance;
172
+
173
+ /**
174
+ * Array of zero-based positions in target where optimal alignment paths end.
175
+ * If gap after query is penalized, gap counts as part of query (NW), otherwise not.
176
+ * Set to NULL if edit distance is larger than k.
177
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
178
+ */
179
+ int* endLocations;
180
+
181
+ /**
182
+ * Array of zero-based positions in target where optimal alignment paths start,
183
+ * they correspond to endLocations.
184
+ * If gap before query is penalized, gap counts as part of query (NW), otherwise not.
185
+ * Set to NULL if not calculated or if edit distance is larger than k.
186
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
187
+ */
188
+ int* startLocations;
189
+
190
+ /**
191
+ * Number of end (and start) locations.
192
+ */
193
+ int numLocations;
194
+
195
+ /**
196
+ * Alignment is found for first pair of start and end locations.
197
+ * Set to NULL if not calculated.
198
+ * Alignment is sequence of numbers: 0, 1, 2, 3.
199
+ * 0 stands for match.
200
+ * 1 stands for insertion to target.
201
+ * 2 stands for insertion to query.
202
+ * 3 stands for mismatch.
203
+ * Alignment aligns query to target from beginning of query till end of query.
204
+ * If gaps are not penalized, they are not in alignment.
205
+ * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
206
+ */
207
+ unsigned char* alignment;
208
+
209
+ /**
210
+ * Length of alignment.
211
+ */
212
+ int alignmentLength;
213
+
214
+ /**
215
+ * Number of different characters in query and target together.
216
+ */
217
+ int alphabetLength;
218
+ } EdlibAlignResult;
219
+
220
+ /**
221
+ * Frees memory in EdlibAlignResult that was allocated by edlib.
222
+ * If you do not use it, make sure to free needed members manually using free().
223
+ */
224
+ EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
225
+
226
+
227
+ /**
228
+ * Aligns two sequences (query and target) using edit distance (levenshtein distance).
229
+ * Through config parameter, this function supports different alignment methods (global, prefix, infix),
230
+ * as well as different modes of search (tasks).
231
+ * It always returns edit distance and end locations of optimal alignment in target.
232
+ * It optionally returns start locations of optimal alignment in target and alignment path,
233
+ * if you choose appropriate tasks.
234
+ * @param [in] query First sequence.
235
+ * @param [in] queryLength Number of characters in first sequence.
236
+ * @param [in] target Second sequence.
237
+ * @param [in] targetLength Number of characters in second sequence.
238
+ * @param [in] config Additional alignment parameters, like alignment method and wanted results.
239
+ * @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
240
+ * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
241
+ */
242
+ EDLIB_API EdlibAlignResult edlibAlign(
243
+ const char* query, int queryLength,
244
+ const char* target, int targetLength,
245
+ const EdlibAlignConfig config
246
+ );
247
+
248
+
249
+ /**
250
+ * Builds cigar string from given alignment sequence.
251
+ * @param [in] alignment Alignment sequence.
252
+ * 0 stands for match.
253
+ * 1 stands for insertion to target.
254
+ * 2 stands for insertion to query.
255
+ * 3 stands for mismatch.
256
+ * @param [in] alignmentLength
257
+ * @param [in] cigarFormat Cigar will be returned in specified format.
258
+ * @return Cigar string.
259
+ * I stands for insertion.
260
+ * D stands for deletion.
261
+ * X stands for mismatch. (used only in extended format)
262
+ * = stands for match. (used only in extended format)
263
+ * M stands for (mis)match. (used only in standard format)
264
+ * String is null terminated.
265
+ * Needed memory is allocated and given pointer is set to it.
266
+ * Do not forget to free it later using free()!
267
+ */
268
+ EDLIB_API char* edlibAlignmentToCigar(
269
+ const unsigned char* alignment, int alignmentLength,
270
+ EdlibCigarFormat cigarFormat
271
+ );
272
+
273
+ #ifdef __cplusplus
274
+ }
275
+ #endif
276
+
277
+ #endif // EDLIB_H
@@ -92,9 +92,14 @@ aligner_get_mode(VALUE self)
92
92
  static VALUE
93
93
  set_mode(EdlibAlignConfig *config, VALUE mode)
94
94
  {
95
+ if (TYPE(mode) == T_SYMBOL)
96
+ {
97
+ mode = rb_String(mode);
98
+ }
95
99
  switch (TYPE(mode))
96
100
  {
97
- case T_STRING:
101
+ case T_STRING:;
102
+ rb_funcall(mode, rb_intern("upcase!"), 0);
98
103
  if (strcmp(RSTRING_PTR(mode), "NW") == 0)
99
104
  {
100
105
  config->mode = 0;
@@ -118,7 +123,7 @@ set_mode(EdlibAlignConfig *config, VALUE mode)
118
123
  {
119
124
  rb_raise(rb_eArgError, "Invalid mode");
120
125
  }
121
- config->mode = NUM2INT(mode);
126
+ config->mode = m;
122
127
  break;
123
128
  default:
124
129
  rb_raise(rb_eArgError, "Invalid mode");
@@ -159,9 +164,14 @@ aligner_get_task(VALUE self)
159
164
  static VALUE
160
165
  set_task(EdlibAlignConfig *config, VALUE task)
161
166
  {
167
+ if (TYPE(task) == T_SYMBOL)
168
+ {
169
+ task = rb_String(task);
170
+ }
162
171
  switch (TYPE(task))
163
172
  {
164
- case T_STRING:
173
+ case T_STRING:;
174
+ rb_funcall(task, rb_intern("upcase!"), 0);
165
175
  if (strcmp(RSTRING_PTR(task), "DISTANCE") == 0)
166
176
  {
167
177
  config->task = 0;
@@ -185,7 +195,7 @@ set_task(EdlibAlignConfig *config, VALUE task)
185
195
  {
186
196
  rb_raise(rb_eArgError, "Invalid task");
187
197
  }
188
- config->task = NUM2INT(task);
198
+ config->task = t;
189
199
  break;
190
200
  default:
191
201
  rb_raise(rb_eArgError, "Invalid task");
@@ -381,7 +391,7 @@ aligner_align(VALUE self, VALUE query, VALUE target)
381
391
  return hash;
382
392
  }
383
393
 
384
- void Init_edlib(void)
394
+ void Init_edlibext(void)
385
395
  {
386
396
  mEdlib = rb_define_module("Edlib");
387
397
  cAligner = rb_define_class_under(mEdlib, "Aligner", rb_cObject);
data/ext/edlib/extconf.rb CHANGED
@@ -1,6 +1,3 @@
1
- require "mkmf"
1
+ require 'mkmf'
2
2
 
3
- dir_config('edlib')
4
- if have_header('edlib.h') and have_library('edlib')
5
- create_makefile('edlib/edlib')
6
- end
3
+ create_makefile('edlib/edlibext')
data/lib/edlib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Edlib
2
- VERSION = "0.0.1"
3
- end
2
+ VERSION = '0.0.2'
3
+ end
data/lib/edlib.rb CHANGED
@@ -1,8 +1,8 @@
1
- require_relative 'edlib/edlib'
1
+ require_relative 'edlib/edlibext'
2
2
 
3
3
  module Edlib
4
4
  class Aligner
5
- def initialize(k:-1, mode: 'NW', task: 'DISTANCE', additional_equalities: nil)
5
+ def initialize(k: -1, mode: 'NW', task: 'DISTANCE', additional_equalities: nil)
6
6
  mode = mode.to_s if mode.is_a? Symbol
7
7
  task = task.to_s if task.is_a? Symbol
8
8
  mode = mode.upcase if mode.is_a? String
@@ -10,4 +10,4 @@ module Edlib
10
10
  initialize_raw(k, mode, task, additional_equalities)
11
11
  end
12
12
  end
13
- end
13
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: edlib
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - kojix2
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-30 00:00:00.000000000 Z
11
+ date: 2022-10-31 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: 'Lightweight, super fast C/C++ library for sequence alignment using edit
14
14
  (Levenshtein) distance. '
@@ -20,7 +20,8 @@ extensions:
20
20
  extra_rdoc_files: []
21
21
  files:
22
22
  - README.md
23
- - ext/edlib/edlib.c
23
+ - ext/edlib/edlib.h
24
+ - ext/edlib/edlibext.c
24
25
  - ext/edlib/extconf.rb
25
26
  - lib/edlib.rb
26
27
  - lib/edlib/version.rb
@@ -28,7 +29,7 @@ homepage: https://github.com/kojix2/ruby-edlib
28
29
  licenses:
29
30
  - MIT
30
31
  metadata: {}
31
- post_install_message:
32
+ post_install_message:
32
33
  rdoc_options: []
33
34
  require_paths:
34
35
  - lib
@@ -44,7 +45,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
44
45
  version: '0'
45
46
  requirements: []
46
47
  rubygems_version: 3.3.7
47
- signing_key:
48
+ signing_key:
48
49
  specification_version: 4
49
50
  summary: ruby-edlib is a wrapper for edlib.
50
51
  test_files: []