mapp2g 0.1.5 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea23c58705cef813135bf96b23383cdeca589643ff08a25ba8b95fd473e26449
4
- data.tar.gz: 7be36a76318408344b4402fafda2393ba51d6e2e36fe2c50e6ad393817ff6cbb
3
+ metadata.gz: caad7f8e3dc7a4894e8cf5d2de8bb8a706cf654896decfe09e58566bec2d4b74
4
+ data.tar.gz: 67dffdb34daf77d28ffce2c18d152344ad78b9cba6bd66f74b9c30f870e3e9ea
5
5
  SHA512:
6
- metadata.gz: cfed44714ac742bd9cc8a17169f85c691292e2fe1d7c4d62960d3babe0cd61cf59b01c0b333af063975aef50f17c99aab9f06103b662da03e00813d456f45e02
7
- data.tar.gz: c4f3711b154e3ad3d730dde9e5637061514563f49607a814c5bbf2a674de3b06a81f96926aac9d8c32bc5feedb06c7a3dab8a481a8f122b138d60530b923af7c
6
+ metadata.gz: 47a0a17673e4aebb4b8d83dbbf7239e783aa926ad698ca967fcd718f825ca8f5883f63b6cdd6df41543aaa7ff301364c7e155d64ce0693569735696f2bbf4fd3
7
+ data.tar.gz: 4f31480439323991e2adffd8bbdd75fa05a68ff50ee683eb7b196de5087f169af63caad68538c1f976b662e6bf09d2ac5c4eea544f9f4aae11fa60498efc9f79
data/README.md CHANGED
@@ -26,6 +26,9 @@ Usage: mapp2g [options]
26
26
  -h, --help show this help message and exit
27
27
  ```
28
28
 
29
+ Query sequences should be in FASTA format. Multiple sequences can be included in one file.
30
+
31
+
29
32
  (example)
30
33
  ```
31
34
  mapp2g -q human_genome.fasta -q p53.protein.fasta
@@ -37,6 +40,18 @@ Reference genomes should be formatted in blastdb before running mapp2g. blastdb c
37
40
  makeblastdb -in human_genome.fasta -dbtype nucl -parse_seqids
38
41
  ```
39
42
 
43
+ ## Outputs
44
+
45
+ For each query, the following files are generated.
46
+
47
+ - query sequence in fasta
48
+ - blast output in tab-delimited format (format 6)
49
+ - exonerate full output
50
+ - exonerate alignment in gff3 format
51
+ - report.json
52
+
53
+ report.json contains all of the information above in JSON Lines format (one JSON object per line).
54
+
40
55
 
41
56
  ## License
42
57
 
data/exe/mapp2g CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "mapp2g"
3
+ #require_relative '../lib/mapp2g' # for development
4
+ require 'mapp2g'
4
5
  require 'bio'
5
6
  require 'tempfile'
6
7
  require 'optparse'
8
+ require 'json'
7
9
 
8
10
  ### Parse options
9
11
 
@@ -75,6 +77,8 @@ end
75
77
 
76
78
  ### Main
77
79
 
80
+ report_json_lines = []
81
+
78
82
  Dir.mkdir(outdir)
79
83
 
80
84
  Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
@@ -83,14 +87,42 @@ Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|
83
87
  tf.close
84
88
  id = (i + 1).to_s
85
89
  query_file_path = "#{outdir}/#{id}.fasta"
86
- out_file_path = "#{outdir}/#{id}.exonerate.txt"
87
90
  File.open(query_file_path, "w"){|o| o.puts fas}
88
91
 
89
92
  mapper = Mapp2g::Mapper.new()
90
93
  res = mapper.run(query_file_path, genome)
91
- File.open(out_file_path, "w"){|o| o.puts res}
94
+ if res
95
+ out_file_path = "#{outdir}/#{id}.exonerate.txt"
96
+ File.open(out_file_path, "w"){|o| o.puts res[:exonerate_result]}
97
+ out_file_path = "#{outdir}/#{id}.blast.txt"
98
+ File.open(out_file_path, "w"){|o| o.puts res[:blast_result]}
99
+ gff3 = Mapp2g::ExonerateOutput.new(res[:exonerate_result]).to_gff3()
100
+ out_file_path = "#{outdir}/#{id}.exonerate.gff3"
101
+ File.open(out_file_path, "w"){|o| o.puts gff3}
102
+
103
+ report = {
104
+ "runtime_id" => id,
105
+ "query_id" => fas.entry_id,
106
+ "query_fasta" => fas.to_s,
107
+ "exonerate" => res[:exonerate_result],
108
+ "blast" => res[:blast_result],
109
+ "gff3" => gff3
110
+ }
111
+ report_json_lines << report.to_json
112
+
113
+ else
114
+ report = {
115
+ "runtime_id" => id,
116
+ "query_id" => fas.entry_id,
117
+ "query_fasta" => fas.to_s,
118
+ "exonerate" => nil,
119
+ "blast" => nil,
120
+ "gff3" => nil
121
+ }
122
+ STDERR.puts "No hit for #{fas.entry_id}"
123
+ end
92
124
 
93
- gff3 = Mapp2g::ExonerateOutput.new(res).to_gff3()
94
- out_file_path = "#{outdir}/#{id}.exonerate.gff3"
95
- File.open(out_file_path, "w"){|o| o.puts gff3}
96
125
  end
126
+
127
+ report_json = report_json_lines.join("\n")
128
+ File.open("#{outdir}/report.json", "w"){|o| o.puts report_json}
data/lib/mapp2g/mapper.rb CHANGED
@@ -4,9 +4,9 @@ module Mapp2g
4
4
 
5
5
  class Mapper
6
6
 
7
- EVALUE_DEFAULT = 1.0e-8
7
+ EVALUE_DEFAULT = 1.0e-5
8
8
  NCPU_DEFAULT = 4
9
- MAX_HSP_INTERVAL = 50000
9
+ MAX_HSP_INTERVAL = 400000
10
10
  EXTENSION = 50000
11
11
  TMPDIR_DEFAULT = Dir.tmpdir
12
12
 
@@ -15,11 +15,11 @@ module Mapp2g
15
15
 
16
16
  ## step 1: tblastn
17
17
  def exec_tblastn_to_know_rough_target_region(query, genome, evalue=EVALUE_DEFAULT, ncpu=NCPU_DEFAULT)
18
- cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} "
18
+ cmd = "tblastn -db #{genome} -query #{query} -max_target_seqs 40 -soft_masking yes -seg yes -outfmt 6 -evalue #{evalue} -num_threads #{ncpu} -culling_limit 2"
19
19
  # puts cmd
20
20
  res = nil
21
- IO.popen(cmd){|io| res = io.read}
22
- # puts res
21
+ IO.popen(cmd){|io| res = io.read}
22
+ # STDERR.puts res
23
23
  if res == ""
24
24
  ## no hit
25
25
  return nil
@@ -27,7 +27,7 @@ module Mapp2g
27
27
  lines = []
28
28
  prev_chr = nil
29
29
  res.split(/\n/).each do |l|
30
- # p prev_chr
30
+ # p prev_chr
31
31
  a = l.chomp.split(/\t/)
32
32
  unless prev_chr
33
33
  lines << l
@@ -42,7 +42,7 @@ module Mapp2g
42
42
  a = lines.shift.chomp.split(/\t/)
43
43
  left, right = [a[8].to_i, a[9].to_i].sort
44
44
 
45
- STDERR.puts [left, right].inspect
45
+ # STDERR.puts [left, right].inspect
46
46
 
47
47
  lines.each do |l|
48
48
  a = l.chomp.split(/\t/)
@@ -57,14 +57,16 @@ module Mapp2g
57
57
  break
58
58
  end
59
59
  end
60
- # p [left, right]
60
+ # STDERR.puts [left, right].inspect
61
61
 
62
62
  top_chromosome = res.split(/\n/)[0].split(/\t/)[1]
63
63
  h = {
64
64
  :top_chromosome => top_chromosome,
65
65
  :left => left,
66
- :right => right
66
+ :right => right,
67
+ :blast_result => res
67
68
  }
69
+ # STDERR.puts h.inspect
68
70
  return h
69
71
  end
70
72
  end
@@ -117,8 +119,10 @@ module Mapp2g
117
119
  tf.close
118
120
 
119
121
  exonerate_result = exec_exonerate(query, tf.path)
120
- return exonerate_result
121
-
122
+ return {
123
+ :blast_result => hit[:blast_result],
124
+ :exonerate_result => exonerate_result
125
+ }
122
126
  else
123
127
  return nil
124
128
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Mapp2g
4
- VERSION = "0.1.5"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mapp2g
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-15 00:00:00.000000000 Z
11
+ date: 2023-07-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: mapp2g is a bioinformatics software, which map and align protein sequences
14
14
  (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment