mapp2g 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea23c58705cef813135bf96b23383cdeca589643ff08a25ba8b95fd473e26449
4
- data.tar.gz: 7be36a76318408344b4402fafda2393ba51d6e2e36fe2c50e6ad393817ff6cbb
3
+ metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
4
+ data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
5
5
  SHA512:
6
- metadata.gz: cfed44714ac742bd9cc8a17169f85c691292e2fe1d7c4d62960d3babe0cd61cf59b01c0b333af063975aef50f17c99aab9f06103b662da03e00813d456f45e02
7
- data.tar.gz: c4f3711b154e3ad3d730dde9e5637061514563f49607a814c5bbf2a674de3b06a81f96926aac9d8c32bc5feedb06c7a3dab8a481a8f122b138d60530b923af7c
6
+ metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
7
+ data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
data/README.md CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
26
26
  -h, --help show this help message and exit
27
27
  ```
28
28
 
29
+ Query sequences should be in FASTA format. Multiple sequences can be included in one file.
30
+
31
+
29
32
  (example)
30
33
  ```
31
34
  mapp2g -q human_genome.fasta -q p53.protein.fasta
@@ -37,6 +40,18 @@ Reference genomes should be formated in blastdb before running mapp2g. blastdb c
37
40
  makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
38
41
  ```
39
42
 
43
+ ## Outputs
44
+
45
+ For each query, the following files are generated.
46
+
47
+ - query sequence in FASTA format
48
+ - blast output in tab-delimited format (format 6)
49
+ - exonerate full output
50
+ - exonerate alignment in gff3 format
51
+ - report.json
52
+
53
+ report.json contains all of the information above in JSON Lines format (one JSON object per line).
54
+
40
55
 
41
56
  ## License
42
57
 
data/exe/mapp2g CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "mapp2g"
3
+ #require_relative '../lib/mapp2g' # for development
4
+ require 'mapp2g'
4
5
  require 'bio'
5
6
  require 'tempfile'
6
7
  require 'optparse'
8
+ require 'json'
7
9
 
8
10
  ### Parse options
9
11
 
@@ -75,6 +77,8 @@ end
75
77
 
76
78
  ### Main
77
79
 
80
+ report_json_lines = []
81
+
78
82
  Dir.mkdir(outdir)
79
83
 
80
84
  Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
@@ -83,14 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
83
87
  tf.close
84
88
  id = (i + 1).to_s
85
89
  query_file_path = "#{outdir}/#{id}.fasta"
86
- out_file_path = "#{outdir}/#{id}.exonerate.txt"
87
90
  File.open(query_file_path, "w"){|o| o.puts fas}
88
91
 
89
92
  mapper = Mapp2g::Mapper.new()
90
93
  res = mapper.run(query_file_path, genome)
91
- File.open(out_file_path, "w"){|o| o.puts res}
94
+ if res
95
+ out_file_path = "#{outdir}/#{id}.exonerate.txt"
96
+ File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
97
+ out_file_path = "#{outdir}/#{id}.blast.txt"
98
+ File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
99
+ gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
100
+ out_file_path = "#{outdir}/#{id}.exonerate.gff3"
101
+ File.open(out_file_path, "w"){|o| o.puts gff3}
102
+
103
+ report = {
104
+ "runtime_id" => id,
105
+ "query_id" => fas.entry_id,
106
+ "query_fasta" => fas.to_s,
107
+ "exonerate" => res[:exonerate_result],
108
+ "blast" => res[:blast_result],
109
+ "gff3" => gff3
110
+ }
111
+ report_json_lines << report.to_json
112
+
113
+ else
114
+ report = {
115
+ "runtime_id" => id,
116
+ "query_id" => fas.entry_id,
117
+ "query_fasta" => fas.to_s,
118
+ "exonerate" => nil,
119
+ "blast" => nil,
120
+ "gff3" => nil
121
+ }
122
+ STDERR.puts "No hit for #{fas.entry_id}"
123
+ end
92
124
 
93
- gff3 = Mapp2g::ExonerateOutput.new(res).to_gff3()
94
- out_file_path = "#{outdir}/#{id}.exonerate.gff3"
95
- File.open(out_file_path, "w"){|o| o.puts gff3}
96
125
  end
126
+
127
+ report_json = report_json_lines.join("\n")
128
+ File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
data/lib/mapp2g/mapper.rb CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
4
4
 
5
5
  class Mapper
6
6
 
7
- EVALUE_DEFAULT = 1.0e-8
7
+ EVALUE_DEFAULT = 1.0e-5
8
8
  NCPU_DEFAULT = 4
9
- MAX_HSP_INTERVAL = 50000
9
+ MAX_HSP_INTERVAL = 400000
10
10
  EXTENSION = 50000
11
11
  TMPDIR_DEFAULT = Dir.tmpdir
12
12
 
@@ -15,11 +15,11 @@ module Mapp2g
15
15
 
16
16
  ## step 1: tblastn
17
17
  def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
18
- cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
18
+ cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
19
19
  # puts cmd
20
20
  res = nil
21
- IO.popen(cmd){|io| res = io.read}
22
- # puts res
21
+ IO.popen(cmd){|io| res = io.read}
22
+ # STDERR.puts res
23
23
  if res == ""
24
24
  ## no hit
25
25
  return nil
@@ -27,7 +27,7 @@ module Mapp2g
27
27
  lines = []
28
28
  prev_chr = nil
29
29
  res.split(/\n/).each do |l|
30
- # p prev_chr
30
+ # p prev_chr
31
31
  a = l.chomp.split(/\t/)
32
32
  unless prev_chr
33
33
  lines << l
@@ -42,7 +42,7 @@ module Mapp2g
42
42
  a = lines.shift.chomp.split(/\t/)
43
43
  left, right = [a[8].to_i, a[9].to_i].sort
44
44
 
45
- STDERR.puts [left, right].inspect
45
+ # STDERR.puts [left, right].inspect
46
46
 
47
47
  lines.each do |l|
48
48
  a = l.chomp.split(/\t/)
@@ -57,14 +57,16 @@ module Mapp2g
57
57
  break
58
58
  end
59
59
  end
60
- # p [left, right]
60
+ # STDERR.puts [left, right].inspect
61
61
 
62
62
  top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
63
63
  h = {
64
64
  :top_chromosome => top_chromosome,
65
65
  :left => left,
66
- :right => right
66
+ :right => right,
67
+ :blast_result => res
67
68
  }
69
+ # STDERR.puts h.inspect
68
70
  return h
69
71
  end
70
72
  end
@@ -117,8 +119,10 @@ module Mapp2g
117
119
  tf.close
118
120
 
119
121
  exonerate_result = exec_exonerate(query, tf.path)
120
- return exonerate_result
121
-
122
+ return {
123
+ :blast_result => hit[:blast_result],
124
+ :exonerate_result => exonerate_result
125
+ }
122
126
  else
123
127
  return nil
124
128
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Mapp2g
4
- VERSION = "0.1.5"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mapp2g
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-15 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: mapp2g is a bioinformatics software, which map and align protein sequences
14
14
  (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment